Back to Article
Tables
Download Notebook

Tables

In [1]:
import pandas as pd
import os
In [4]:
path2SP = "/Users/ctoruno/OneDrive - World Justice Project/EU Subnational/EU-S Data/Automated Qualitative Checks/Data"

eu_member_states = [
    "Austria","Belgium","Bulgaria","Croatia","Cyprus","Czechia","Denmark","Estonia",
    "Finland","France","Germany","Greece","Hungary","Ireland","Italy","Latvia",
    "Lithuania","Luxembourg","Malta","Netherlands","Poland","Portugal","Romania","Slovakia",
    "Slovenia","Spain","Sweden"
]

Table 1: Extracted Data

In [3]:
data_tbl01 = {
    "Country": ["Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czechia", "Denmark", "Estonia", "Finland", "France", 
                "Germany", "Greece", "Hungary", "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg", "Malta", 
                "Netherlands", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia", "Spain", "Sweden"],
    "Total News Extracted": [46145, 21287, 38118, 37068, 34955, 41415, 11312, 12370, 6647, 64527, 
                             45321, 49504, 18111, 48409, 93858, 5487, 14396, 7894, 10842, 
                             23935, 21434, 29624, 33264, 34874, 10211, 112820, 6417],
    "Date Range": ["Mar 06, 2023 - Mar 07, 2024", "Jun 07, 2023 - Mar 07, 2024", "Jul 07, 2023 - Mar 07, 2024",
                   "Jul 07, 2023 - Mar 07, 2024", "Jul 07, 2023 - Mar 08, 2024", "Jul 07, 2023 - Mar 08, 2024",
                   "Aug 07, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
                   "Aug 07, 2023 - Mar 07, 2024", "Jan 08, 2024 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
                   "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
                   "Aug 09, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
                   "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
                   "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
                   "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Jan 08, 2024 - Mar 07, 2024"]
}

tbl01 = pd.DataFrame(data_tbl01)
total_row = pd.DataFrame([["Total", sum(data_tbl01["Total News Extracted"]), ""]], columns=tbl01.columns)
tbl01_total = pd.concat([tbl01, total_row], ignore_index=True)
In [4]:
In [4]:
tbl01_total.style.hide(axis="index").format({
    'Total News Extracted': '{:,.0f}'
})
Table 1: Total news extracted per country
Country Total News Extracted Date Range
Austria 46,145 Mar 06, 2023 - Mar 07, 2024
Belgium 21,287 Jun 07, 2023 - Mar 07, 2024
Bulgaria 38,118 Jul 07, 2023 - Mar 07, 2024
Croatia 37,068 Jul 07, 2023 - Mar 07, 2024
Cyprus 34,955 Jul 07, 2023 - Mar 08, 2024
Czechia 41,415 Jul 07, 2023 - Mar 08, 2024
Denmark 11,312 Aug 07, 2023 - Mar 07, 2024
Estonia 12,370 Aug 08, 2023 - Mar 07, 2024
Finland 6,647 Aug 08, 2023 - Mar 07, 2024
France 64,527 Aug 07, 2023 - Mar 07, 2024
Germany 45,321 Jan 08, 2024 - Mar 07, 2024
Greece 49,504 Aug 08, 2023 - Mar 07, 2024
Hungary 18,111 Aug 08, 2023 - Mar 07, 2024
Ireland 48,409 Aug 08, 2023 - Mar 07, 2024
Italy 93,858 Aug 08, 2023 - Mar 07, 2024
Latvia 5,487 Aug 09, 2023 - Mar 07, 2024
Lithuania 14,396 Aug 08, 2023 - Mar 07, 2024
Luxembourg 7,894 Aug 08, 2023 - Mar 07, 2024
Malta 10,842 Aug 08, 2023 - Mar 07, 2024
Netherlands 23,935 Aug 08, 2023 - Mar 07, 2024
Poland 21,434 Aug 08, 2023 - Mar 07, 2024
Portugal 29,624 Aug 08, 2023 - Mar 07, 2024
Romania 33,264 Aug 08, 2023 - Mar 07, 2024
Slovakia 34,874 Aug 08, 2023 - Mar 07, 2024
Slovenia 10,211 Aug 08, 2023 - Mar 07, 2024
Spain 112,820 Aug 08, 2023 - Mar 07, 2024
Sweden 6,417 Jan 08, 2024 - Mar 07, 2024
Total 880,245

Table 2: Failed Translation

In [5]:
path2exdata = f"{path2SP}/data-extraction-1/data4translation"
extracted_data = [pd.read_parquet(f"{path2exdata}/{country}_tp.parquet.gzip") for country in eu_member_states]
exdata = pd.concat(extracted_data)
In [24]:
path2trdata = f"{path2SP}/data-extraction-1/ready4class"
files = os.listdir(path2trdata)
translated_data = [pd.read_parquet(f"{path2trdata}/{x}") for x in files]
trdata = pd.concat(translated_data)
In [45]:
In [45]:
tbl02 = (
    exdata.country.value_counts().reset_index().sort_values("country")
    .merge(
        trdata.country.value_counts().reset_index(), 
        how = "left", 
        on  = "country"
    )
    .assign(
        success = lambda df: (1-((df['count_x'] - df['count_y']) / df['count_x']))*100
    )
    .rename(
        columns = {
            "country": "Country",
            "count_x": "Extracted News (n)",
            "count_y": "Translated News (n)",
            "success": "Translated News (%)"
        },
        # inplace = True
    )
)

total_row = pd.DataFrame(
    [[
        "European Union", 
        tbl02["Extracted News (n)"].sum(),
        tbl02["Translated News (n)"].sum(),
        tbl02["Translated News (%)"].mean(),
    ]], 
    columns=tbl02.columns
)
tbl02_total = pd.concat([tbl02, total_row], ignore_index=True)

tbl02_total.style.hide(axis="index").format({
    "Extracted News (n)": "{:,.0f}",
    "Translated News (n)": "{:,.0f}",
    "Translated News (%)": "{:,.1f}"
})
Table 2: Translation Process (Rate of Success)
Country Extracted News (n) Translated News (n) Translated News (%)
Austria 46,145 44,181 95.7
Belgium 21,287 19,846 93.2
Bulgaria 38,118 33,070 86.8
Croatia 37,068 36,595 98.7
Cyprus 34,955 33,772 96.6
Czechia 41,415 40,620 98.1
Denmark 11,312 10,761 95.1
Estonia 12,370 11,334 91.6
Finland 6,647 6,399 96.3
France 64,527 62,900 97.5
Germany 45,321 40,444 89.2
Greece 49,504 47,182 95.3
Hungary 18,111 17,965 99.2
Ireland 48,409 48,409 100.0
Italy 93,858 82,885 88.3
Latvia 5,487 5,467 99.6
Lithuania 14,396 13,287 92.3
Luxembourg 7,894 7,636 96.7
Malta 10,842 10,556 97.4
Netherlands 23,935 22,642 94.6
Poland 21,434 17,490 81.6
Portugal 29,624 29,416 99.3
Romania 33,264 32,234 96.9
Slovakia 34,874 28,973 83.1
Slovenia 10,211 9,818 96.2
Spain 112,820 88,324 78.3
Sweden 6,417 6,223 97.0
European Union 880,245 808,429 93.9

Table 3: First Stage Classification

In [5]:
path2cldata = f"{path2SP}/data-classification-1/0_compiled"

mutate_dict = {
    "Unrelated"      : 0,
    "Related"        : 1,
    "Justice"        : 1,
    "Governance"     : 1,
    "Rule of Law"    : 1,
    "Skipped article": 2
}
In [6]:
cldata = [pd.read_parquet(f"{path2cldata}/{country}_classified.parquet.gzip") for country in eu_member_states]
cldata_master = pd.concat(cldata)
In [75]:
data = []
for country in eu_member_states:
    file = f"{path2cldata}/{country}_classified.parquet.gzip"
    df = pd.read_parquet(file)
    df["topic_related"] = df["topic_related"].str.strip()
    df["relation"] = df["topic_related"].replace(mutate_dict)

    if country == "Czechia":
        location_column = "location_Czech"
    else:
        location_column = f"location_{country}"

    df["related_within"] = df.apply(lambda row: row[location_column] and row["relation"] == 1, axis=1)

    total   = len(df)
    related = (df.relation.value_counts()[1] / total)*100
    within  = (df.related_within.value_counts()[True] / total)*100
    within_ = (df.related_within.value_counts()[True])
    skipped = (df.relation.value_counts()[2] / total)*100

    dict = {
        "Country": country,
        "Related (%)" : related,
        "Related - Within (%)": within,
        "Related - Within (n)": within_,
        "Unclassified (%)": skipped
    }

    data.append(dict)
In [84]:
tbl03 = pd.DataFrame(data)
total_row = pd.DataFrame(
    [[
        "European Union", 
        tbl02["Related (%)"].mean(),
        tbl02["Related - Within (%)"].mean(),
        tbl02["Related - Within (n)"].sum(),
        tbl02["Unclassified (%)"].mean(),
    ]], 
    columns=tbl03.columns
)
tbl03_total = pd.concat([tbl03, total_row], ignore_index=True)
In [85]:
In [85]:
tbl03_total.style.hide(axis="index").format({
    "Related (%)": "{:,.1f}",
    "Related - Within (%)": "{:,.1f}",
    "Related - Within (n)": "{:,.0f}",
    "Unclassified (%)": "{:,.1f}"
})
Table 3: Broad classification results
Country Related (%) Related - Within (%) Related - Within (n) Unclassified (%)
Austria 24.9 12.7 5,590 0.5
Belgium 24.2 13.8 2,737 1.0
Bulgaria 35.1 24.4 8,071 0.3
Croatia 28.8 17.6 6,431 0.8
Cyprus 29.1 19.9 6,716 0.3
Czechia 21.8 17.5 7,114 1.3
Denmark 30.6 17.6 1,897 0.9
Estonia 22.5 14.1 1,599 0.3
Finland 32.8 23.5 1,501 0.8
France 26.5 15.8 9,955 0.7
Germany 29.4 23.0 9,292 0.8
Greece 26.6 17.1 8,027 1.2
Hungary 25.8 15.4 2,768 0.8
Ireland 8.0 5.5 2,676 0.2
Italy 29.1 24.1 18,800 0.8
Latvia 15.1 11.6 636 0.6
Lithuania 31.5 24.6 3,272 0.7
Luxembourg 26.7 14.7 1,119 0.5
Malta 37.6 30.4 3,205 0.4
Netherlands 24.8 14.2 3,223 0.6
Poland 40.7 33.7 5,902 0.8
Portugal 23.2 14.8 4,217 0.3
Romania 43.6 27.2 8,765 1.0
Slovakia 26.8 17.9 5,172 0.4
Slovenia 30.7 18.2 1,786 0.7
Spain 27.7 18.9 16,685 1.0
Sweden 26.7 15.6 968 2.5
European Union 27.8 18.7 148,124 0.8

Table 4: Second Stage Classification

In [27]:
tbl04_data = (
    cldata_master.copy()
    .drop_duplicates(subset = "id")
    .loc[:,["country", "pillar_1", "pillar_2", "pillar_3", "pillar_4", "pillar_5", "pillar_6", "pillar_7", "pillar_8"]]
    .groupby("country")
    .agg("mean")
    .apply(lambda x: x*100)
    .reset_index()
    .rename(
        columns = {
            "country" : "Country",
            "pillar_1": "Pillar 1",
            "pillar_2": "Pillar 2",
            "pillar_3": "Pillar 3",
            "pillar_4": "Pillar 4",
            "pillar_5": "Pillar 5",
            "pillar_6": "Pillar 6",
            "pillar_7": "Pillar 7",
            "pillar_8": "Pillar 8",
        }
    )
)
total_row = pd.DataFrame(
    [["European Union"] + tbl04_data.iloc[:, 1:].mean().values.tolist()], 
    columns=tbl04_data.columns
)
tbl04_total = pd.concat([tbl04_data, total_row], ignore_index=True)
In [28]:
In [28]:
tbl04_total.style.hide(axis="index").format(precision=1)
Table 4: Pillar classification results (%)
Country Pillar 1 Pillar 2 Pillar 3 Pillar 4 Pillar 5 Pillar 6 Pillar 7 Pillar 8
Austria 18.9 9.8 2.2 15.3 5.8 2.5 6.1 15.8
Belgium 13.3 9.0 1.4 12.8 5.8 1.9 6.5 17.1
Bulgaria 27.4 16.7 3.6 17.8 8.1 5.2 9.2 21.3
Croatia 20.6 13.0 2.3 15.6 8.1 3.0 6.5 19.8
Cyprus 22.6 13.7 3.9 16.2 7.5 6.1 7.9 15.6
Czechia 14.7 9.1 1.7 10.8 6.6 2.7 4.6 14.6
Denmark 24.3 12.8 3.0 20.2 9.2 3.0 8.2 20.5
Estonia 16.9 8.9 2.0 11.7 4.3 3.4 5.9 13.2
Finland 24.2 12.9 1.9 20.3 10.2 3.6 8.2 22.5
France 20.5 9.7 2.3 18.5 7.1 3.1 5.6 15.3
Germany 20.9 9.5 2.1 17.7 9.2 2.5 7.0 19.4
Greece 20.2 12.1 2.6 16.7 8.6 3.9 6.3 17.4
Hungary 21.8 13.2 2.6 16.4 5.9 3.6 4.0 14.0
Ireland 18.6 9.4 2.7 16.5 7.0 3.7 8.0 16.2
Italy 17.9 12.3 2.5 17.2 8.2 4.1 7.4 18.4
Latvia 10.4 6.1 2.0 7.5 3.1 3.3 2.1 6.2
Lithuania 22.7 14.1 2.4 15.3 6.6 4.4 7.7 20.7
Luxembourg 16.4 8.3 3.8 15.8 6.6 3.7 5.8 13.1
Malta 30.1 19.3 7.4 24.0 6.4 9.2 13.0 21.4
Netherlands 16.6 9.6 2.6 15.5 5.6 3.0 6.0 14.8
Poland 31.6 18.1 3.8 23.1 7.6 5.6 11.4 23.5
Portugal 18.6 9.5 3.0 13.6 3.1 3.6 5.2 10.6
Romania 28.6 23.3 4.1 22.5 9.5 7.0 8.1 25.0
Slovakia 23.1 13.0 3.6 16.0 4.6 3.0 4.9 15.5
Slovenia 24.4 11.9 3.1 18.4 7.4 4.3 7.4 16.4
Spain 20.6 11.0 2.6 17.3 6.3 3.3 6.1 15.8
Sweden 19.8 10.2 1.9 18.1 9.7 2.0 6.3 19.1
European Union 20.9 12.1 2.9 16.7 7.0 3.9 6.9 17.1