import json
import string
from datasets import Dataset, DatasetDict, load_from_disk
import seaborn as sns
import matplotlib.pyplot as plt
from juddges.settings import DATA_PATH
Analyze Text of England and Wales Judgements
path_ = DATA_PATH / "datasets" / "en"
jsonl_file = path_ / "england_wales_data_refined_7.jsonl"
dataset_path = path_ / "en_judgements_dataset"
# Read the raw JSONL records into a list
data = []
with open(jsonl_file, 'r') as file:
    for line in file:
        data.append(json.loads(line))
# Build a HF Dataset straight from the JSONL file and wrap it in a DatasetDict
dataset = Dataset.from_json(jsonl_file)
dataset_dict = DatasetDict({"train": dataset})
# Save the dataset to disk
dataset.save_to_disk(dataset_path)
# Load the dataset from disk
ds = load_from_disk(dataset_path)
def tagger(item):
    text = item["content"]
    dummy_tokens = text.split()
    item["chars"] = len(text)
    item["num_dummy_tokens"] = len(dummy_tokens)
    # Count tokens that contain at least one non-punctuation character
    item["num_non_ws_tokens"] = sum(
        1 for tok in dummy_tokens if any(char not in string.punctuation for char in tok.strip())
    )
    return item
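For intuition, here is how tagger treats a toy record (a hypothetical example, not taken from the dataset):

example = tagger({"content": "The appeal is dismissed . "})
# chars == 26 (raw character count)
# num_dummy_tokens == 5 (whitespace-split tokens)
# num_non_ws_tokens == 4 (the lone "." is punctuation-only, so it is skipped)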
ds = ds.map(tagger, num_proc=8)
ds.cleanup_cache_files()
stats = (
    ds.select_columns(["_id", "type", "appeal_type", "appeal_outcome", "chars", "num_dummy_tokens", "num_non_ws_tokens"])
    .to_pandas()
    .convert_dtypes(dtype_backend="pyarrow")
)
stats["type"] = stats["type"].astype("category")
stats.head()
|   | _id | type | appeal_type | appeal_outcome | chars | num_dummy_tokens | num_non_ws_tokens |
|---|-----|------|-------------|----------------|-------|------------------|-------------------|
| 0 | ab0224364e4cf6562c82f8861d5268d4fa22b2ec45e0f7... | crown_court | <NA> | <NA> | 12444 | 2229 | 2155 |
| 1 | d4630d93258ea51ecff4bc4015443b4eecf8d9b2e5b7c5... | supreme_court | conviction | <NA> | 20977 | 3681 | 3586 |
| 2 | 37183a714b626cfe98081ac0250c804f992f340281f6d2... | crown_court | <NA> | <NA> | 40570 | 7199 | 7097 |
| 3 | b41933b19505ab8767ce30faf8db9524f737ec5ac2c17e... | crown_court | <NA> | <NA> | 19459 | 3515 | 3432 |
| 4 | 418382a2a6c0c32d3d2bd4cb7b39e1ba259dc6bf56a78e... | crown_court | <NA> | allowed | 10352 | 1879 | 1793 |
ax = sns.histplot(
    x=stats["num_non_ws_tokens"],
    log_scale=True,
    bins=50,
)
ax.set(title="#tokens distribution")
court_type_card_order = stats["type"].value_counts().index.tolist()
court_type_data = stats["type"].value_counts().plot.barh(logx=True, title="Types cardinality")
appeal_type_card_order = stats["appeal_type"].value_counts().index.tolist()
appeal_type_data = stats["appeal_type"].value_counts().plot.barh(logx=True, title="Appeal types cardinality")
appeal_outcome_card_order = stats["appeal_outcome"].value_counts().index.tolist()
appeal_outcome_data = stats["appeal_outcome"].value_counts().plot.barh(logx=True, title="Appeal outcomes cardinality")
# sns.displot(data=stats, x="num_non_ws_tokens", col="type", col_wrap=3, log_scale=(True, False), facet_kws=dict(sharey=False, sharex=False), kind="hist", bins=25)
_, ax = plt.subplots(figsize=(8, 12))
ax.set(title="Per type text length distribution")
sns.boxenplot(data=stats, y="type", x="num_non_ws_tokens", order=court_type_card_order, log_scale=True)
# sns.displot(data=stats, x="num_non_ws_tokens", col="type", col_wrap=3, log_scale=(True, False), facet_kws=dict(sharey=False, sharex=False), kind="hist", bins=25)
_, ax = plt.subplots(figsize=(8, 12))
ax.set(title="Per appeal type text length distribution")
sns.boxenplot(data=stats, y="appeal_type", x="num_non_ws_tokens", order=appeal_type_card_order, log_scale=True)
# sns.displot(data=stats, x="num_non_ws_tokens", col="type", col_wrap=3, log_scale=(True, False), facet_kws=dict(sharey=False, sharex=False), kind="hist", bins=25)
_, ax = plt.subplots(figsize=(8, 12))
ax.set(title="Per appeal outcome text length distribution")
sns.boxenplot(data=stats, y="appeal_outcome", x="num_non_ws_tokens", order=appeal_outcome_card_order, log_scale=True)
Tokenize
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large")
ds = ds.map(
    lambda examples: tokenizer(examples["content"], padding=False, truncation=False),
    batched=True,
    num_proc=44,
)
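Worth noting: E5 models are XLM-RoBERTa-based, so the tokenizer's context limit is typically 512 tokens; the token counts computed below can be read against that bar.

print(tokenizer.model_max_length)  # typically 512 for this model family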
tokenized = []
for item in ds:
    tokenized.append({"num_tokens": len(item["input_ids"])})
num_tokens = [item['num_tokens'] for item in tokenized]
filtered_tokens = [token for token in num_tokens if token <= 40000]
sns.histplot(filtered_tokens, bins=100)
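Given the model's context limit noted above, a quick follow-up (a sketch; frac_over is a hypothetical name) measures how many judgements would not fit in a single window:

frac_over = sum(n > tokenizer.model_max_length for n in num_tokens) / len(num_tokens)
print(f"{frac_over:.1%} of judgements exceed the model's context window")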
# Plot the box plot
plt.figure(figsize=(6, 6))
sns.boxplot(filtered_tokens)
plt.show()