import polars as pl
from datasets import load_from_disk
from juddges.settings import DATA_PATH
# Analyse England and Wales Dataset
# Build the path to the English judgments dataset under the project data
# directory and load it with `datasets.load_from_disk`.
path_ = DATA_PATH / "datasets" / "en"
dataset_path = path_ / "en_judgements_dataset"
ds = load_from_disk(dataset_path)
ds
Dataset({
features: ['_id', 'citation', 'signature', 'date', 'publicationDate', 'type', 'excerpt', 'content', 'judges', 'caseNumbers', 'citation_references', 'legislation', 'file_name', 'appeal_type', 'appeal_outcome', 'xml_uri', 'uri'],
num_rows: 6154
})
# Convert the loaded dataset to a Polars DataFrame (going through pandas),
# then cast both date columns to strings.
df = ds.to_pandas()
pl_df = pl.DataFrame(df)
pl_df = pl_df.with_columns(
    [
        pl.col("date").cast(pl.Utf8),
        pl.col("publicationDate").cast(pl.Utf8),
    ]
)
# Define date format
dt_fmt = "%Y-%m-%d %H:%M:%S%.f %Z"

# Perform column transformations: parse the two date columns with `dt_fmt`
# and store the low-cardinality label columns as categoricals.
# NOTE(review): the describe() output recorded in this notebook reports `date`
# as null for all 6154 rows -- confirm whether the source column is empty or
# whether its values do not match `dt_fmt`.
pl_df = pl_df.with_columns(
    [
        pl.col("date").str.strptime(pl.Datetime, format=dt_fmt),
        pl.col("publicationDate").str.strptime(pl.Datetime, format=dt_fmt),
        pl.col("type").cast(pl.Categorical),
        pl.col("appeal_type").cast(pl.Categorical),
        pl.col("appeal_outcome").cast(pl.Categorical),
    ]
)

# Display the first few rows of the transformed DataFrame
print(pl_df.head())
shape: (5, 17)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ _id ┆ citation ┆ signature ┆ date ┆ … ┆ appeal_ty ┆ appeal_ou ┆ xml_uri ┆ uri │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ pe ┆ tcome ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ datetime[ ┆ ┆ --- ┆ --- ┆ str ┆ str │
│ ┆ ┆ ┆ ns] ┆ ┆ cat ┆ cat ┆ ┆ │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ ab0224364 ┆ [2008] ┆ EWCA_Crim ┆ null ┆ … ┆ null ┆ null ┆ https://c ┆ https:// │
│ e4cf6562c ┆ EWCA Crim ┆ _2952 ┆ ┆ ┆ ┆ ┆ aselaw.na ┆ caselaw. │
│ 82f8861d5 ┆ 2952 ┆ ┆ ┆ ┆ ┆ ┆ tionalarc ┆ national │
│ 268d4… ┆ ┆ ┆ ┆ ┆ ┆ ┆ hives… ┆ archives │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │
│ d4630d932 ┆ [2006] ┆ EWCA_Crim ┆ null ┆ … ┆ convictio ┆ null ┆ https://c ┆ https:// │
│ 58ea51ecf ┆ EWCA Crim ┆ _3187 ┆ ┆ ┆ n ┆ ┆ aselaw.na ┆ caselaw. │
│ f4bc40154 ┆ 3187 ┆ ┆ ┆ ┆ ┆ ┆ tionalarc ┆ national │
│ 43b4e… ┆ ┆ ┆ ┆ ┆ ┆ ┆ hives… ┆ archives │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │
│ 37183a714 ┆ [2012] ┆ EWCA_Crim ┆ null ┆ … ┆ null ┆ null ┆ https://c ┆ https:// │
│ b626cfe98 ┆ EWCA Crim ┆ _1840 ┆ ┆ ┆ ┆ ┆ aselaw.na ┆ caselaw. │
│ 081ac0250 ┆ 1840 ┆ ┆ ┆ ┆ ┆ ┆ tionalarc ┆ national │
│ c804f… ┆ ┆ ┆ ┆ ┆ ┆ ┆ hives… ┆ archives │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │
│ b41933b19 ┆ [2014] ┆ EWCA_Crim ┆ null ┆ … ┆ null ┆ null ┆ https://c ┆ https:// │
│ 505ab8767 ┆ EWCA Crim ┆ _1730 ┆ ┆ ┆ ┆ ┆ aselaw.na ┆ caselaw. │
│ ce30faf8d ┆ 1730 ┆ ┆ ┆ ┆ ┆ ┆ tionalarc ┆ national │
│ b9524… ┆ ┆ ┆ ┆ ┆ ┆ ┆ hives… ┆ archives │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │
│ 418382a2a ┆ [2018] ┆ EWCA_Crim ┆ null ┆ … ┆ null ┆ allowed ┆ https://c ┆ https:// │
│ 6c0c32d3d ┆ EWCA Crim ┆ _2189 ┆ ┆ ┆ ┆ ┆ aselaw.na ┆ caselaw. │
│ 2bd4cb7b3 ┆ 2189 ┆ ┆ ┆ ┆ ┆ ┆ tionalarc ┆ national │
│ 9e1ba… ┆ ┆ ┆ ┆ ┆ ┆ ┆ hives… ┆ archives │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │
└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘
# Per-column summary statistics (count, null_count, mean, std, min, quartiles, max).
pl_df.describe()
shape: (9, 18)
statistic | _id | citation | signature | date | publicationDate | type | excerpt | content | judges | caseNumbers | citation_references | legislation | file_name | appeal_type | appeal_outcome | xml_uri | uri |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
str | str | str | str | str | str | str | str | str | f64 | f64 | f64 | f64 | str | str | str | str | str |
"count" | "6154" | "6154" | "6154" | "0" | "6154" | "6154" | "6058" | "6154" | 6115.0 | 4934.0 | 1392.0 | 1826.0 | "6154" | "834" | "1368" | "6154" | "6154" |
"null_count" | "0" | "0" | "0" | "6154" | "0" | "0" | "96" | "0" | 39.0 | 1220.0 | 4762.0 | 4328.0 | "0" | "5320" | "4786" | "0" | "0" |
"mean" | null | null | null | null | "2013-10-13 09:… | null | null | null | null | null | null | null | null | null | null | null | null |
"std" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null |
"min" | "001d3b389f60bf… | "[2003] EWCA Cr… | "EWCA_(Crim)_14… | null | "2003-01-04 00:… | null | "********REPORT… | " 2020] EWCA Cr… | null | null | null | null | "2003_01_04-1.x… | null | null | "https://casela… | "https://casela… |
"25%" | null | null | null | null | "2008-06-11 00:… | null | null | null | null | null | null | null | null | null | null | null | null |
"50%" | null | null | null | null | "2012-11-29 00:… | null | null | null | null | null | null | null | null | null | null | null | null |
"75%" | null | null | null | null | "2019-06-07 00:… | null | null | null | null | null | null | null | null | null | null | null | null |
"max" | "ffffb6552ad898… | "[2024] EWCA Cr… | "Ewca_Crim_664" | null | "2024-05-22 00:… | null | "…WARNING: repo… | "…WARNING: repo… | null | null | null | null | "2024_05_22-615… | null | null | "https://casela… | "https://casela… |
"type"].value_counts() pl_df[
shape: (7, 2)
type | count |
---|---|
cat | u32 |
"crown_court" | 5472 |
"supreme_court" | 660 |
"martial_court" | 11 |
"high_court_adm… | 2 |
"high_court_div… | 7 |
"civil_criminal… | 1 |
"division_court… | 1 |
"appeal_type"].value_counts() pl_df[
shape: (3, 2)
appeal_type | count |
---|---|
cat | u32 |
"conviction" | 496 |
null | 5320 |
"sentence" | 338 |
"appeal_outcome"].value_counts() pl_df[
shape: (5, 2)
appeal_outcome | count |
---|---|
cat | u32 |
null | 4786 |
"granted" | 20 |
"dismissed" | 586 |
"refused" | 65 |
"allowed" | 697 |
# Fraction of rows whose `content` / `excerpt` value is null.
n_rows = len(pl_df)
missing_content = pl_df["content"].null_count() / n_rows
missing_excerpt = pl_df["excerpt"].null_count() / n_rows
print(f"Missing content: {missing_content}")
print(f"Missing excerpt: {missing_excerpt}")
Missing content: 0.0
Missing excerpt: 0.015599610009749756
"excerpt"].str.strip_chars().str.len_chars().to_pandas().plot.hist(
pl_df[=50, log=True, title="Excerpt #chars distribution"
bins )
"excerpt"] pl_df[
shape: (6_154,)
excerpt |
---|
str |
"No. 2008/03296… |
"Neutral Citati… |
"Neutral Citati… |
"Neutral Citati… |
"No: 201802356 … |
… |
"Neutral Citati… |
"Case No: 2002/… |
"Neutral Citati… |
"Case No: 20030… |
"2017/05382/B1 … |
"excerpt"].str.strip_chars().str.len_chars().to_pandas() pl_df[
0 500.0
1 500.0
2 499.0
3 500.0
4 499.0
...
6149 499.0
6150 500.0
6151 500.0
6152 499.0
6153 499.0
Name: excerpt, Length: 6154, dtype: float64