import pandas as pd
import osIn [1]:
In [4]:
path2SP = "/Users/ctoruno/OneDrive - World Justice Project/EU Subnational/EU-S Data/Automated Qualitative Checks/Data"
eu_member_states = [
"Austria","Belgium","Bulgaria","Croatia","Cyprus","Czechia","Denmark","Estonia",
"Finland","France","Germany","Greece","Hungary","Ireland","Italy","Latvia",
"Lithuania","Luxembourg","Malta","Netherlands","Poland","Portugal","Romania","Slovakia",
"Slovenia","Spain","Sweden"
]Table 1: Extracted Data
In [3]:
data_tbl01 = {
"Country": ["Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czechia", "Denmark", "Estonia", "Finland", "France",
"Germany", "Greece", "Hungary", "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg", "Malta",
"Netherlands", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia", "Spain", "Sweden"],
"Total News Extracted": [46145, 21287, 38118, 37068, 34955, 41415, 11312, 12370, 6647, 64527,
45321, 49504, 18111, 48409, 93858, 5487, 14396, 7894, 10842,
23935, 21434, 29624, 33264, 34874, 10211, 112820, 6417],
"Date Range": ["Mar 06, 2023 - Mar 07, 2024", "Jun 07, 2023 - Mar 07, 2024", "Jul 07, 2023 - Mar 07, 2024",
"Jul 07, 2023 - Mar 07, 2024", "Jul 07, 2023 - Mar 08, 2024", "Jul 07, 2023 - Mar 08, 2024",
"Aug 07, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
"Aug 07, 2023 - Mar 07, 2024", "Jan 08, 2024 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
"Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
"Aug 09, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
"Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
"Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
"Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Jan 08, 2024 - Mar 07, 2024"]
}
tbl01 = pd.DataFrame(data_tbl01)
total_row = pd.DataFrame([["Total", sum(data_tbl01["Total News Extracted"]), ""]], columns=tbl01.columns)
tbl01_total = pd.concat([tbl01, total_row], ignore_index=True)In [4]:
In [4]:
tbl01_total.style.hide(axis="index").format({
'Total News Extracted': '{:,.0f}'
})| Country | Total News Extracted | Date Range |
|---|---|---|
| Austria | 46,145 | Mar 06, 2023 - Mar 07, 2024 |
| Belgium | 21,287 | Jun 07, 2023 - Mar 07, 2024 |
| Bulgaria | 38,118 | Jul 07, 2023 - Mar 07, 2024 |
| Croatia | 37,068 | Jul 07, 2023 - Mar 07, 2024 |
| Cyprus | 34,955 | Jul 07, 2023 - Mar 08, 2024 |
| Czechia | 41,415 | Jul 07, 2023 - Mar 08, 2024 |
| Denmark | 11,312 | Aug 07, 2023 - Mar 07, 2024 |
| Estonia | 12,370 | Aug 08, 2023 - Mar 07, 2024 |
| Finland | 6,647 | Aug 08, 2023 - Mar 07, 2024 |
| France | 64,527 | Aug 07, 2023 - Mar 07, 2024 |
| Germany | 45,321 | Jan 08, 2024 - Mar 07, 2024 |
| Greece | 49,504 | Aug 08, 2023 - Mar 07, 2024 |
| Hungary | 18,111 | Aug 08, 2023 - Mar 07, 2024 |
| Ireland | 48,409 | Aug 08, 2023 - Mar 07, 2024 |
| Italy | 93,858 | Aug 08, 2023 - Mar 07, 2024 |
| Latvia | 5,487 | Aug 09, 2023 - Mar 07, 2024 |
| Lithuania | 14,396 | Aug 08, 2023 - Mar 07, 2024 |
| Luxembourg | 7,894 | Aug 08, 2023 - Mar 07, 2024 |
| Malta | 10,842 | Aug 08, 2023 - Mar 07, 2024 |
| Netherlands | 23,935 | Aug 08, 2023 - Mar 07, 2024 |
| Poland | 21,434 | Aug 08, 2023 - Mar 07, 2024 |
| Portugal | 29,624 | Aug 08, 2023 - Mar 07, 2024 |
| Romania | 33,264 | Aug 08, 2023 - Mar 07, 2024 |
| Slovakia | 34,874 | Aug 08, 2023 - Mar 07, 2024 |
| Slovenia | 10,211 | Aug 08, 2023 - Mar 07, 2024 |
| Spain | 112,820 | Aug 08, 2023 - Mar 07, 2024 |
| Sweden | 6,417 | Jan 08, 2024 - Mar 07, 2024 |
| Total | 880,245 |
Table 2: Failed Translation
In [5]:
path2exdata = f"{path2SP}/data-extraction-1/data4translation"
extracted_data = [pd.read_parquet(f"{path2exdata}/{country}_tp.parquet.gzip") for country in eu_member_states]
exdata = pd.concat(extracted_data)In [24]:
path2trdata = f"{path2SP}/data-extraction-1/ready4class"
files = os.listdir(path2trdata)
translated_data = [pd.read_parquet(f"{path2trdata}/{x}") for x in files]
trdata = pd.concat(translated_data)In [45]:
In [45]:
tbl02 = (
exdata.country.value_counts().reset_index().sort_values("country")
.merge(
trdata.country.value_counts().reset_index(),
how = "left",
on = "country"
)
.assign(
success = lambda df: (1-((df['count_x'] - df['count_y']) / df['count_x']))*100
)
.rename(
columns = {
"country": "Country",
"count_x": "Extracted News (n)",
"count_y": "Translated News (n)",
"success": "Translated News (%)"
},
# inplace = True
)
)
total_row = pd.DataFrame(
[[
"European Union",
tbl02["Extracted News (n)"].sum(),
tbl02["Translated News (n)"].sum(),
tbl02["Translated News (%)"].mean(),
]],
columns=tbl02.columns
)
tbl02_total = pd.concat([tbl02, total_row], ignore_index=True)
tbl02_total.style.hide(axis="index").format({
"Extracted News (n)": "{:,.0f}",
"Translated News (n)": "{:,.0f}",
"Translated News (%)": "{:,.1f}"
})| Country | Extracted News (n) | Translated News (n) | Translated News (%) |
|---|---|---|---|
| Austria | 46,145 | 44,181 | 95.7 |
| Belgium | 21,287 | 19,846 | 93.2 |
| Bulgaria | 38,118 | 33,070 | 86.8 |
| Croatia | 37,068 | 36,595 | 98.7 |
| Cyprus | 34,955 | 33,772 | 96.6 |
| Czechia | 41,415 | 40,620 | 98.1 |
| Denmark | 11,312 | 10,761 | 95.1 |
| Estonia | 12,370 | 11,334 | 91.6 |
| Finland | 6,647 | 6,399 | 96.3 |
| France | 64,527 | 62,900 | 97.5 |
| Germany | 45,321 | 40,444 | 89.2 |
| Greece | 49,504 | 47,182 | 95.3 |
| Hungary | 18,111 | 17,965 | 99.2 |
| Ireland | 48,409 | 48,409 | 100.0 |
| Italy | 93,858 | 82,885 | 88.3 |
| Latvia | 5,487 | 5,467 | 99.6 |
| Lithuania | 14,396 | 13,287 | 92.3 |
| Luxembourg | 7,894 | 7,636 | 96.7 |
| Malta | 10,842 | 10,556 | 97.4 |
| Netherlands | 23,935 | 22,642 | 94.6 |
| Poland | 21,434 | 17,490 | 81.6 |
| Portugal | 29,624 | 29,416 | 99.3 |
| Romania | 33,264 | 32,234 | 96.9 |
| Slovakia | 34,874 | 28,973 | 83.1 |
| Slovenia | 10,211 | 9,818 | 96.2 |
| Spain | 112,820 | 88,324 | 78.3 |
| Sweden | 6,417 | 6,223 | 97.0 |
| European Union | 880,245 | 808,429 | 93.9 |
Table 3: First Stage Classification
In [5]:
path2cldata = f"{path2SP}/data-classification-1/0_compiled"
mutate_dict = {
"Unrelated" : 0,
"Related" : 1,
"Justice" : 1,
"Governance" : 1,
"Rule of Law" : 1,
"Skipped article": 2
}In [6]:
cldata = [pd.read_parquet(f"{path2cldata}/{country}_classified.parquet.gzip") for country in eu_member_states]
cldata_master = pd.concat(cldata)In [75]:
data = []
for country in eu_member_states:
file = f"{path2cldata}/{country}_classified.parquet.gzip"
df = pd.read_parquet(file)
df["topic_related"] = df["topic_related"].str.strip()
df["relation"] = df["topic_related"].replace(mutate_dict)
if country == "Czechia":
location_column = "location_Czech"
else:
location_column = f"location_{country}"
df["related_within"] = df.apply(lambda row: row[location_column] and row["relation"] == 1, axis=1)
total = len(df)
related = (df.relation.value_counts()[1] / total)*100
within = (df.related_within.value_counts()[True] / total)*100
within_ = (df.related_within.value_counts()[True])
skipped = (df.relation.value_counts()[2] / total)*100
dict = {
"Country": country,
"Related (%)" : related,
"Related - Within (%)": within,
"Related - Within (n)": within_,
"Unclassified (%)": skipped
}
data.append(dict)In [84]:
tbl03 = pd.DataFrame(data)
total_row = pd.DataFrame(
[[
"European Union",
tbl02["Related (%)"].mean(),
tbl02["Related - Within (%)"].mean(),
tbl02["Related - Within (n)"].sum(),
tbl02["Unclassified (%)"].mean(),
]],
columns=tbl03.columns
)
tbl03_total = pd.concat([tbl03, total_row], ignore_index=True)In [85]:
In [85]:
tbl03_total.style.hide(axis="index").format({
"Related (%)": "{:,.1f}",
"Related - Within (%)": "{:,.1f}",
"Related - Within (n)": "{:,.0f}",
"Unclassified (%)": "{:,.1f}"
})| Country | Related (%) | Related - Within (%) | Related - Within (n) | Unclassified (%) |
|---|---|---|---|---|
| Austria | 24.9 | 12.7 | 5,590 | 0.5 |
| Belgium | 24.2 | 13.8 | 2,737 | 1.0 |
| Bulgaria | 35.1 | 24.4 | 8,071 | 0.3 |
| Croatia | 28.8 | 17.6 | 6,431 | 0.8 |
| Cyprus | 29.1 | 19.9 | 6,716 | 0.3 |
| Czechia | 21.8 | 17.5 | 7,114 | 1.3 |
| Denmark | 30.6 | 17.6 | 1,897 | 0.9 |
| Estonia | 22.5 | 14.1 | 1,599 | 0.3 |
| Finland | 32.8 | 23.5 | 1,501 | 0.8 |
| France | 26.5 | 15.8 | 9,955 | 0.7 |
| Germany | 29.4 | 23.0 | 9,292 | 0.8 |
| Greece | 26.6 | 17.1 | 8,027 | 1.2 |
| Hungary | 25.8 | 15.4 | 2,768 | 0.8 |
| Ireland | 8.0 | 5.5 | 2,676 | 0.2 |
| Italy | 29.1 | 24.1 | 18,800 | 0.8 |
| Latvia | 15.1 | 11.6 | 636 | 0.6 |
| Lithuania | 31.5 | 24.6 | 3,272 | 0.7 |
| Luxembourg | 26.7 | 14.7 | 1,119 | 0.5 |
| Malta | 37.6 | 30.4 | 3,205 | 0.4 |
| Netherlands | 24.8 | 14.2 | 3,223 | 0.6 |
| Poland | 40.7 | 33.7 | 5,902 | 0.8 |
| Portugal | 23.2 | 14.8 | 4,217 | 0.3 |
| Romania | 43.6 | 27.2 | 8,765 | 1.0 |
| Slovakia | 26.8 | 17.9 | 5,172 | 0.4 |
| Slovenia | 30.7 | 18.2 | 1,786 | 0.7 |
| Spain | 27.7 | 18.9 | 16,685 | 1.0 |
| Sweden | 26.7 | 15.6 | 968 | 2.5 |
| European Union | 27.8 | 18.7 | 148,124 | 0.8 |
Table 4: Second Stage Classification
In [27]:
tbl04_data = (
cldata_master.copy()
.drop_duplicates(subset = "id")
.loc[:,["country", "pillar_1", "pillar_2", "pillar_3", "pillar_4", "pillar_5", "pillar_6", "pillar_7", "pillar_8"]]
.groupby("country")
.agg("mean")
.apply(lambda x: x*100)
.reset_index()
.rename(
columns = {
"country" : "Country",
"pillar_1": "Pillar 1",
"pillar_2": "Pillar 2",
"pillar_3": "Pillar 3",
"pillar_4": "Pillar 4",
"pillar_5": "Pillar 5",
"pillar_6": "Pillar 6",
"pillar_7": "Pillar 7",
"pillar_8": "Pillar 8",
}
)
)
total_row = pd.DataFrame(
[["European Union"] + tbl04_data.iloc[:, 1:].mean().values.tolist()],
columns=tbl04_data.columns
)
tbl04_total = pd.concat([tbl04_data, total_row], ignore_index=True)In [28]:
In [28]:
tbl04_total.style.hide(axis="index").format(precision=1)| Country | Pillar 1 | Pillar 2 | Pillar 3 | Pillar 4 | Pillar 5 | Pillar 6 | Pillar 7 | Pillar 8 |
|---|---|---|---|---|---|---|---|---|
| Austria | 18.9 | 9.8 | 2.2 | 15.3 | 5.8 | 2.5 | 6.1 | 15.8 |
| Belgium | 13.3 | 9.0 | 1.4 | 12.8 | 5.8 | 1.9 | 6.5 | 17.1 |
| Bulgaria | 27.4 | 16.7 | 3.6 | 17.8 | 8.1 | 5.2 | 9.2 | 21.3 |
| Croatia | 20.6 | 13.0 | 2.3 | 15.6 | 8.1 | 3.0 | 6.5 | 19.8 |
| Cyprus | 22.6 | 13.7 | 3.9 | 16.2 | 7.5 | 6.1 | 7.9 | 15.6 |
| Czechia | 14.7 | 9.1 | 1.7 | 10.8 | 6.6 | 2.7 | 4.6 | 14.6 |
| Denmark | 24.3 | 12.8 | 3.0 | 20.2 | 9.2 | 3.0 | 8.2 | 20.5 |
| Estonia | 16.9 | 8.9 | 2.0 | 11.7 | 4.3 | 3.4 | 5.9 | 13.2 |
| Finland | 24.2 | 12.9 | 1.9 | 20.3 | 10.2 | 3.6 | 8.2 | 22.5 |
| France | 20.5 | 9.7 | 2.3 | 18.5 | 7.1 | 3.1 | 5.6 | 15.3 |
| Germany | 20.9 | 9.5 | 2.1 | 17.7 | 9.2 | 2.5 | 7.0 | 19.4 |
| Greece | 20.2 | 12.1 | 2.6 | 16.7 | 8.6 | 3.9 | 6.3 | 17.4 |
| Hungary | 21.8 | 13.2 | 2.6 | 16.4 | 5.9 | 3.6 | 4.0 | 14.0 |
| Ireland | 18.6 | 9.4 | 2.7 | 16.5 | 7.0 | 3.7 | 8.0 | 16.2 |
| Italy | 17.9 | 12.3 | 2.5 | 17.2 | 8.2 | 4.1 | 7.4 | 18.4 |
| Latvia | 10.4 | 6.1 | 2.0 | 7.5 | 3.1 | 3.3 | 2.1 | 6.2 |
| Lithuania | 22.7 | 14.1 | 2.4 | 15.3 | 6.6 | 4.4 | 7.7 | 20.7 |
| Luxembourg | 16.4 | 8.3 | 3.8 | 15.8 | 6.6 | 3.7 | 5.8 | 13.1 |
| Malta | 30.1 | 19.3 | 7.4 | 24.0 | 6.4 | 9.2 | 13.0 | 21.4 |
| Netherlands | 16.6 | 9.6 | 2.6 | 15.5 | 5.6 | 3.0 | 6.0 | 14.8 |
| Poland | 31.6 | 18.1 | 3.8 | 23.1 | 7.6 | 5.6 | 11.4 | 23.5 |
| Portugal | 18.6 | 9.5 | 3.0 | 13.6 | 3.1 | 3.6 | 5.2 | 10.6 |
| Romania | 28.6 | 23.3 | 4.1 | 22.5 | 9.5 | 7.0 | 8.1 | 25.0 |
| Slovakia | 23.1 | 13.0 | 3.6 | 16.0 | 4.6 | 3.0 | 4.9 | 15.5 |
| Slovenia | 24.4 | 11.9 | 3.1 | 18.4 | 7.4 | 4.3 | 7.4 | 16.4 |
| Spain | 20.6 | 11.0 | 2.6 | 17.3 | 6.3 | 3.3 | 6.1 | 15.8 |
| Sweden | 19.8 | 10.2 | 1.9 | 18.1 | 9.7 | 2.0 | 6.3 | 19.1 |
| European Union | 20.9 | 12.1 | 2.9 | 16.7 | 7.0 | 3.9 | 6.9 | 17.1 |