import pandas as pd
import os
In [1]:
In [4]:
= "/Users/ctoruno/OneDrive - World Justice Project/EU Subnational/EU-S Data/Automated Qualitative Checks/Data"
path2SP
= [
eu_member_states "Austria","Belgium","Bulgaria","Croatia","Cyprus","Czechia","Denmark","Estonia",
"Finland","France","Germany","Greece","Hungary","Ireland","Italy","Latvia",
"Lithuania","Luxembourg","Malta","Netherlands","Poland","Portugal","Romania","Slovakia",
"Slovenia","Spain","Sweden"
]
Table 1: Extracted Data
In [3]:
= {
data_tbl01 "Country": ["Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czechia", "Denmark", "Estonia", "Finland", "France",
"Germany", "Greece", "Hungary", "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg", "Malta",
"Netherlands", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia", "Spain", "Sweden"],
"Total News Extracted": [46145, 21287, 38118, 37068, 34955, 41415, 11312, 12370, 6647, 64527,
45321, 49504, 18111, 48409, 93858, 5487, 14396, 7894, 10842,
23935, 21434, 29624, 33264, 34874, 10211, 112820, 6417],
"Date Range": ["Mar 06, 2023 - Mar 07, 2024", "Jun 07, 2023 - Mar 07, 2024", "Jul 07, 2023 - Mar 07, 2024",
"Jul 07, 2023 - Mar 07, 2024", "Jul 07, 2023 - Mar 08, 2024", "Jul 07, 2023 - Mar 08, 2024",
"Aug 07, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
"Aug 07, 2023 - Mar 07, 2024", "Jan 08, 2024 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
"Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
"Aug 09, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
"Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
"Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
"Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Jan 08, 2024 - Mar 07, 2024"]
}
= pd.DataFrame(data_tbl01)
tbl01 = pd.DataFrame([["Total", sum(data_tbl01["Total News Extracted"]), ""]], columns=tbl01.columns)
total_row = pd.concat([tbl01, total_row], ignore_index=True) tbl01_total
In [4]:
In [4]:
="index").format({
tbl01_total.style.hide(axis'Total News Extracted': '{:,.0f}'
})
Country | Total News Extracted | Date Range |
---|---|---|
Austria | 46,145 | Mar 06, 2023 - Mar 07, 2024 |
Belgium | 21,287 | Jun 07, 2023 - Mar 07, 2024 |
Bulgaria | 38,118 | Jul 07, 2023 - Mar 07, 2024 |
Croatia | 37,068 | Jul 07, 2023 - Mar 07, 2024 |
Cyprus | 34,955 | Jul 07, 2023 - Mar 08, 2024 |
Czechia | 41,415 | Jul 07, 2023 - Mar 08, 2024 |
Denmark | 11,312 | Aug 07, 2023 - Mar 07, 2024 |
Estonia | 12,370 | Aug 08, 2023 - Mar 07, 2024 |
Finland | 6,647 | Aug 08, 2023 - Mar 07, 2024 |
France | 64,527 | Aug 07, 2023 - Mar 07, 2024 |
Germany | 45,321 | Jan 08, 2024 - Mar 07, 2024 |
Greece | 49,504 | Aug 08, 2023 - Mar 07, 2024 |
Hungary | 18,111 | Aug 08, 2023 - Mar 07, 2024 |
Ireland | 48,409 | Aug 08, 2023 - Mar 07, 2024 |
Italy | 93,858 | Aug 08, 2023 - Mar 07, 2024 |
Latvia | 5,487 | Aug 09, 2023 - Mar 07, 2024 |
Lithuania | 14,396 | Aug 08, 2023 - Mar 07, 2024 |
Luxembourg | 7,894 | Aug 08, 2023 - Mar 07, 2024 |
Malta | 10,842 | Aug 08, 2023 - Mar 07, 2024 |
Netherlands | 23,935 | Aug 08, 2023 - Mar 07, 2024 |
Poland | 21,434 | Aug 08, 2023 - Mar 07, 2024 |
Portugal | 29,624 | Aug 08, 2023 - Mar 07, 2024 |
Romania | 33,264 | Aug 08, 2023 - Mar 07, 2024 |
Slovakia | 34,874 | Aug 08, 2023 - Mar 07, 2024 |
Slovenia | 10,211 | Aug 08, 2023 - Mar 07, 2024 |
Spain | 112,820 | Aug 08, 2023 - Mar 07, 2024 |
Sweden | 6,417 | Jan 08, 2024 - Mar 07, 2024 |
Total | 880,245 |
Table 2: Failed Translation
In [5]:
= f"{path2SP}/data-extraction-1/data4translation"
path2exdata = [pd.read_parquet(f"{path2exdata}/{country}_tp.parquet.gzip") for country in eu_member_states]
extracted_data = pd.concat(extracted_data) exdata
In [24]:
= f"{path2SP}/data-extraction-1/ready4class"
path2trdata = os.listdir(path2trdata)
files = [pd.read_parquet(f"{path2trdata}/{x}") for x in files]
translated_data = pd.concat(translated_data) trdata
In [45]:
In [45]:
= (
tbl02 "country")
exdata.country.value_counts().reset_index().sort_values(
.merge(
trdata.country.value_counts().reset_index(), = "left",
how = "country"
on
)
.assign(= lambda df: (1-((df['count_x'] - df['count_y']) / df['count_x']))*100
success
)
.rename(= {
columns "country": "Country",
"count_x": "Extracted News (n)",
"count_y": "Translated News (n)",
"success": "Translated News (%)"
},# inplace = True
)
)
= pd.DataFrame(
total_row
[["European Union",
"Extracted News (n)"].sum(),
tbl02["Translated News (n)"].sum(),
tbl02["Translated News (%)"].mean(),
tbl02[
]], =tbl02.columns
columns
)= pd.concat([tbl02, total_row], ignore_index=True)
tbl02_total
="index").format({
tbl02_total.style.hide(axis"Extracted News (n)": "{:,.0f}",
"Translated News (n)": "{:,.0f}",
"Translated News (%)": "{:,.1f}"
})
Country | Extracted News (n) | Translated News (n) | Translated News (%) |
---|---|---|---|
Austria | 46,145 | 44,181 | 95.7 |
Belgium | 21,287 | 19,846 | 93.2 |
Bulgaria | 38,118 | 33,070 | 86.8 |
Croatia | 37,068 | 36,595 | 98.7 |
Cyprus | 34,955 | 33,772 | 96.6 |
Czechia | 41,415 | 40,620 | 98.1 |
Denmark | 11,312 | 10,761 | 95.1 |
Estonia | 12,370 | 11,334 | 91.6 |
Finland | 6,647 | 6,399 | 96.3 |
France | 64,527 | 62,900 | 97.5 |
Germany | 45,321 | 40,444 | 89.2 |
Greece | 49,504 | 47,182 | 95.3 |
Hungary | 18,111 | 17,965 | 99.2 |
Ireland | 48,409 | 48,409 | 100.0 |
Italy | 93,858 | 82,885 | 88.3 |
Latvia | 5,487 | 5,467 | 99.6 |
Lithuania | 14,396 | 13,287 | 92.3 |
Luxembourg | 7,894 | 7,636 | 96.7 |
Malta | 10,842 | 10,556 | 97.4 |
Netherlands | 23,935 | 22,642 | 94.6 |
Poland | 21,434 | 17,490 | 81.6 |
Portugal | 29,624 | 29,416 | 99.3 |
Romania | 33,264 | 32,234 | 96.9 |
Slovakia | 34,874 | 28,973 | 83.1 |
Slovenia | 10,211 | 9,818 | 96.2 |
Spain | 112,820 | 88,324 | 78.3 |
Sweden | 6,417 | 6,223 | 97.0 |
European Union | 880,245 | 808,429 | 93.9 |
Table 3: First Stage Classification
In [5]:
= f"{path2SP}/data-classification-1/0_compiled"
path2cldata
= {
mutate_dict "Unrelated" : 0,
"Related" : 1,
"Justice" : 1,
"Governance" : 1,
"Rule of Law" : 1,
"Skipped article": 2
}
In [6]:
= [pd.read_parquet(f"{path2cldata}/{country}_classified.parquet.gzip") for country in eu_member_states]
cldata = pd.concat(cldata) cldata_master
In [75]:
= []
data for country in eu_member_states:
file = f"{path2cldata}/{country}_classified.parquet.gzip"
= pd.read_parquet(file)
df "topic_related"] = df["topic_related"].str.strip()
df["relation"] = df["topic_related"].replace(mutate_dict)
df[
if country == "Czechia":
= "location_Czech"
location_column else:
= f"location_{country}"
location_column
"related_within"] = df.apply(lambda row: row[location_column] and row["relation"] == 1, axis=1)
df[
= len(df)
total = (df.relation.value_counts()[1] / total)*100
related = (df.related_within.value_counts()[True] / total)*100
within = (df.related_within.value_counts()[True])
within_ = (df.relation.value_counts()[2] / total)*100
skipped
dict = {
"Country": country,
"Related (%)" : related,
"Related - Within (%)": within,
"Related - Within (n)": within_,
"Unclassified (%)": skipped
}
dict) data.append(
In [84]:
= pd.DataFrame(data)
tbl03 = pd.DataFrame(
total_row
[["European Union",
"Related (%)"].mean(),
tbl02["Related - Within (%)"].mean(),
tbl02["Related - Within (n)"].sum(),
tbl02["Unclassified (%)"].mean(),
tbl02[
]], =tbl03.columns
columns
)= pd.concat([tbl03, total_row], ignore_index=True) tbl03_total
In [85]:
In [85]:
="index").format({
tbl03_total.style.hide(axis"Related (%)": "{:,.1f}",
"Related - Within (%)": "{:,.1f}",
"Related - Within (n)": "{:,.0f}",
"Unclassified (%)": "{:,.1f}"
})
Country | Related (%) | Related - Within (%) | Related - Within (n) | Unclassified (%) |
---|---|---|---|---|
Austria | 24.9 | 12.7 | 5,590 | 0.5 |
Belgium | 24.2 | 13.8 | 2,737 | 1.0 |
Bulgaria | 35.1 | 24.4 | 8,071 | 0.3 |
Croatia | 28.8 | 17.6 | 6,431 | 0.8 |
Cyprus | 29.1 | 19.9 | 6,716 | 0.3 |
Czechia | 21.8 | 17.5 | 7,114 | 1.3 |
Denmark | 30.6 | 17.6 | 1,897 | 0.9 |
Estonia | 22.5 | 14.1 | 1,599 | 0.3 |
Finland | 32.8 | 23.5 | 1,501 | 0.8 |
France | 26.5 | 15.8 | 9,955 | 0.7 |
Germany | 29.4 | 23.0 | 9,292 | 0.8 |
Greece | 26.6 | 17.1 | 8,027 | 1.2 |
Hungary | 25.8 | 15.4 | 2,768 | 0.8 |
Ireland | 8.0 | 5.5 | 2,676 | 0.2 |
Italy | 29.1 | 24.1 | 18,800 | 0.8 |
Latvia | 15.1 | 11.6 | 636 | 0.6 |
Lithuania | 31.5 | 24.6 | 3,272 | 0.7 |
Luxembourg | 26.7 | 14.7 | 1,119 | 0.5 |
Malta | 37.6 | 30.4 | 3,205 | 0.4 |
Netherlands | 24.8 | 14.2 | 3,223 | 0.6 |
Poland | 40.7 | 33.7 | 5,902 | 0.8 |
Portugal | 23.2 | 14.8 | 4,217 | 0.3 |
Romania | 43.6 | 27.2 | 8,765 | 1.0 |
Slovakia | 26.8 | 17.9 | 5,172 | 0.4 |
Slovenia | 30.7 | 18.2 | 1,786 | 0.7 |
Spain | 27.7 | 18.9 | 16,685 | 1.0 |
Sweden | 26.7 | 15.6 | 968 | 2.5 |
European Union | 27.8 | 18.7 | 148,124 | 0.8 |
Table 4: Second Stage Classification
In [27]:
= (
tbl04_data
cldata_master.copy()= "id")
.drop_duplicates(subset "country", "pillar_1", "pillar_2", "pillar_3", "pillar_4", "pillar_5", "pillar_6", "pillar_7", "pillar_8"]]
.loc[:,["country")
.groupby("mean")
.agg(apply(lambda x: x*100)
.
.reset_index()
.rename(= {
columns "country" : "Country",
"pillar_1": "Pillar 1",
"pillar_2": "Pillar 2",
"pillar_3": "Pillar 3",
"pillar_4": "Pillar 4",
"pillar_5": "Pillar 5",
"pillar_6": "Pillar 6",
"pillar_7": "Pillar 7",
"pillar_8": "Pillar 8",
}
)
)= pd.DataFrame(
total_row "European Union"] + tbl04_data.iloc[:, 1:].mean().values.tolist()],
[[=tbl04_data.columns
columns
)= pd.concat([tbl04_data, total_row], ignore_index=True) tbl04_total
In [28]:
In [28]:
="index").format(precision=1) tbl04_total.style.hide(axis
Country | Pillar 1 | Pillar 2 | Pillar 3 | Pillar 4 | Pillar 5 | Pillar 6 | Pillar 7 | Pillar 8 |
---|---|---|---|---|---|---|---|---|
Austria | 18.9 | 9.8 | 2.2 | 15.3 | 5.8 | 2.5 | 6.1 | 15.8 |
Belgium | 13.3 | 9.0 | 1.4 | 12.8 | 5.8 | 1.9 | 6.5 | 17.1 |
Bulgaria | 27.4 | 16.7 | 3.6 | 17.8 | 8.1 | 5.2 | 9.2 | 21.3 |
Croatia | 20.6 | 13.0 | 2.3 | 15.6 | 8.1 | 3.0 | 6.5 | 19.8 |
Cyprus | 22.6 | 13.7 | 3.9 | 16.2 | 7.5 | 6.1 | 7.9 | 15.6 |
Czechia | 14.7 | 9.1 | 1.7 | 10.8 | 6.6 | 2.7 | 4.6 | 14.6 |
Denmark | 24.3 | 12.8 | 3.0 | 20.2 | 9.2 | 3.0 | 8.2 | 20.5 |
Estonia | 16.9 | 8.9 | 2.0 | 11.7 | 4.3 | 3.4 | 5.9 | 13.2 |
Finland | 24.2 | 12.9 | 1.9 | 20.3 | 10.2 | 3.6 | 8.2 | 22.5 |
France | 20.5 | 9.7 | 2.3 | 18.5 | 7.1 | 3.1 | 5.6 | 15.3 |
Germany | 20.9 | 9.5 | 2.1 | 17.7 | 9.2 | 2.5 | 7.0 | 19.4 |
Greece | 20.2 | 12.1 | 2.6 | 16.7 | 8.6 | 3.9 | 6.3 | 17.4 |
Hungary | 21.8 | 13.2 | 2.6 | 16.4 | 5.9 | 3.6 | 4.0 | 14.0 |
Ireland | 18.6 | 9.4 | 2.7 | 16.5 | 7.0 | 3.7 | 8.0 | 16.2 |
Italy | 17.9 | 12.3 | 2.5 | 17.2 | 8.2 | 4.1 | 7.4 | 18.4 |
Latvia | 10.4 | 6.1 | 2.0 | 7.5 | 3.1 | 3.3 | 2.1 | 6.2 |
Lithuania | 22.7 | 14.1 | 2.4 | 15.3 | 6.6 | 4.4 | 7.7 | 20.7 |
Luxembourg | 16.4 | 8.3 | 3.8 | 15.8 | 6.6 | 3.7 | 5.8 | 13.1 |
Malta | 30.1 | 19.3 | 7.4 | 24.0 | 6.4 | 9.2 | 13.0 | 21.4 |
Netherlands | 16.6 | 9.6 | 2.6 | 15.5 | 5.6 | 3.0 | 6.0 | 14.8 |
Poland | 31.6 | 18.1 | 3.8 | 23.1 | 7.6 | 5.6 | 11.4 | 23.5 |
Portugal | 18.6 | 9.5 | 3.0 | 13.6 | 3.1 | 3.6 | 5.2 | 10.6 |
Romania | 28.6 | 23.3 | 4.1 | 22.5 | 9.5 | 7.0 | 8.1 | 25.0 |
Slovakia | 23.1 | 13.0 | 3.6 | 16.0 | 4.6 | 3.0 | 4.9 | 15.5 |
Slovenia | 24.4 | 11.9 | 3.1 | 18.4 | 7.4 | 4.3 | 7.4 | 16.4 |
Spain | 20.6 | 11.0 | 2.6 | 17.3 | 6.3 | 3.3 | 6.1 | 15.8 |
Sweden | 19.8 | 10.2 | 1.9 | 18.1 | 9.7 | 2.0 | 6.3 | 19.1 |
European Union | 20.9 | 12.1 | 2.9 | 16.7 | 7.0 | 3.9 | 6.9 | 17.1 |