Tables

In [1]:

import pandas as pd
import os

In [4]:

path2SP = "/Users/ctoruno/OneDrive - World Justice Project/EU Subnational/EU-S Data/Automated Qualitative Checks/Data"

eu_member_states = [
    "Austria","Belgium","Bulgaria","Croatia","Cyprus","Czechia","Denmark","Estonia",
    "Finland","France","Germany","Greece","Hungary","Ireland","Italy","Latvia",
    "Lithuania","Luxembourg","Malta","Netherlands","Poland","Portugal","Romania","Slovakia",
    "Slovenia","Spain","Sweden"
]

Table 1: Extracted Data

In [3]:

data_tbl01 = {
    "Country": ["Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czechia", "Denmark", "Estonia", "Finland", "France", 
                "Germany", "Greece", "Hungary", "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg", "Malta", 
                "Netherlands", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia", "Spain", "Sweden"],
    "Total News Extracted": [46145, 21287, 38118, 37068, 34955, 41415, 11312, 12370, 6647, 64527, 
                             45321, 49504, 18111, 48409, 93858, 5487, 14396, 7894, 10842, 
                             23935, 21434, 29624, 33264, 34874, 10211, 112820, 6417],
    "Date Range": ["Mar 06, 2023 - Mar 07, 2024", "Jun 07, 2023 - Mar 07, 2024", "Jul 07, 2023 - Mar 07, 2024",
                   "Jul 07, 2023 - Mar 07, 2024", "Jul 07, 2023 - Mar 08, 2024", "Jul 07, 2023 - Mar 08, 2024",
                   "Aug 07, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
                   "Aug 07, 2023 - Mar 07, 2024", "Jan 08, 2024 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
                   "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
                   "Aug 09, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
                   "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
                   "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024",
                   "Aug 08, 2023 - Mar 07, 2024", "Aug 08, 2023 - Mar 07, 2024", "Jan 08, 2024 - Mar 07, 2024"]
}

tbl01 = pd.DataFrame(data_tbl01)
total_row = pd.DataFrame([["Total", sum(data_tbl01["Total News Extracted"]), ""]], columns=tbl01.columns)
tbl01_total = pd.concat([tbl01, total_row], ignore_index=True)

In [4]:

In [4]:

tbl01_total.style.hide(axis="index").format({
    'Total News Extracted': '{:,.0f}'
})

Table 1: Total news extracted per country

Country	Total News Extracted	Date Range
Austria	46,145	Mar 06, 2023 - Mar 07, 2024
Belgium	21,287	Jun 07, 2023 - Mar 07, 2024
Bulgaria	38,118	Jul 07, 2023 - Mar 07, 2024
Croatia	37,068	Jul 07, 2023 - Mar 07, 2024
Cyprus	34,955	Jul 07, 2023 - Mar 08, 2024
Czechia	41,415	Jul 07, 2023 - Mar 08, 2024
Denmark	11,312	Aug 07, 2023 - Mar 07, 2024
Estonia	12,370	Aug 08, 2023 - Mar 07, 2024
Finland	6,647	Aug 08, 2023 - Mar 07, 2024
France	64,527	Aug 07, 2023 - Mar 07, 2024
Germany	45,321	Jan 08, 2024 - Mar 07, 2024
Greece	49,504	Aug 08, 2023 - Mar 07, 2024
Hungary	18,111	Aug 08, 2023 - Mar 07, 2024
Ireland	48,409	Aug 08, 2023 - Mar 07, 2024
Italy	93,858	Aug 08, 2023 - Mar 07, 2024
Latvia	5,487	Aug 09, 2023 - Mar 07, 2024
Lithuania	14,396	Aug 08, 2023 - Mar 07, 2024
Luxembourg	7,894	Aug 08, 2023 - Mar 07, 2024
Malta	10,842	Aug 08, 2023 - Mar 07, 2024
Netherlands	23,935	Aug 08, 2023 - Mar 07, 2024
Poland	21,434	Aug 08, 2023 - Mar 07, 2024
Portugal	29,624	Aug 08, 2023 - Mar 07, 2024
Romania	33,264	Aug 08, 2023 - Mar 07, 2024
Slovakia	34,874	Aug 08, 2023 - Mar 07, 2024
Slovenia	10,211	Aug 08, 2023 - Mar 07, 2024
Spain	112,820	Aug 08, 2023 - Mar 07, 2024
Sweden	6,417	Jan 08, 2024 - Mar 07, 2024
Total	880,245

Table 2: Failed Translation

In [5]:

path2exdata = f"{path2SP}/data-extraction-1/data4translation"
extracted_data = [pd.read_parquet(f"{path2exdata}/{country}_tp.parquet.gzip") for country in eu_member_states]
exdata = pd.concat(extracted_data)

In [24]:

path2trdata = f"{path2SP}/data-extraction-1/ready4class"
files = os.listdir(path2trdata)
translated_data = [pd.read_parquet(f"{path2trdata}/{x}") for x in files]
trdata = pd.concat(translated_data)

In [45]:

In [45]:

tbl02 = (
    exdata.country.value_counts().reset_index().sort_values("country")
    .merge(
        trdata.country.value_counts().reset_index(), 
        how = "left", 
        on  = "country"
    )
    .assign(
        success = lambda df: (1-((df['count_x'] - df['count_y']) / df['count_x']))*100
    )
    .rename(
        columns = {
            "country": "Country",
            "count_x": "Extracted News (n)",
            "count_y": "Translated News (n)",
            "success": "Translated News (%)"
        },
        # inplace = True
    )
)

total_row = pd.DataFrame(
    [[
        "European Union", 
        tbl02["Extracted News (n)"].sum(),
        tbl02["Translated News (n)"].sum(),
        tbl02["Translated News (%)"].mean(),
    ]], 
    columns=tbl02.columns
)
tbl02_total = pd.concat([tbl02, total_row], ignore_index=True)

tbl02_total.style.hide(axis="index").format({
    "Extracted News (n)": "{:,.0f}",
    "Translated News (n)": "{:,.0f}",
    "Translated News (%)": "{:,.1f}"
})

Table 2: Translation Process (Rate of Success)

Country	Extracted News (n)	Translated News (n)	Translated News (%)
Austria	46,145	44,181	95.7
Belgium	21,287	19,846	93.2
Bulgaria	38,118	33,070	86.8
Croatia	37,068	36,595	98.7
Cyprus	34,955	33,772	96.6
Czechia	41,415	40,620	98.1
Denmark	11,312	10,761	95.1
Estonia	12,370	11,334	91.6
Finland	6,647	6,399	96.3
France	64,527	62,900	97.5
Germany	45,321	40,444	89.2
Greece	49,504	47,182	95.3
Hungary	18,111	17,965	99.2
Ireland	48,409	48,409	100.0
Italy	93,858	82,885	88.3
Latvia	5,487	5,467	99.6
Lithuania	14,396	13,287	92.3
Luxembourg	7,894	7,636	96.7
Malta	10,842	10,556	97.4
Netherlands	23,935	22,642	94.6
Poland	21,434	17,490	81.6
Portugal	29,624	29,416	99.3
Romania	33,264	32,234	96.9
Slovakia	34,874	28,973	83.1
Slovenia	10,211	9,818	96.2
Spain	112,820	88,324	78.3
Sweden	6,417	6,223	97.0
European Union	880,245	808,429	93.9

Table 3: First Stage Classification

In [5]:

path2cldata = f"{path2SP}/data-classification-1/0_compiled"

mutate_dict = {
    "Unrelated"      : 0,
    "Related"        : 1,
    "Justice"        : 1,
    "Governance"     : 1,
    "Rule of Law"    : 1,
    "Skipped article": 2
}

In [6]:

cldata = [pd.read_parquet(f"{path2cldata}/{country}_classified.parquet.gzip") for country in eu_member_states]
cldata_master = pd.concat(cldata)

In [75]:

data = []
for country in eu_member_states:
    file = f"{path2cldata}/{country}_classified.parquet.gzip"
    df = pd.read_parquet(file)
    df["topic_related"] = df["topic_related"].str.strip()
    df["relation"] = df["topic_related"].replace(mutate_dict)

    if country == "Czechia":
        location_column = "location_Czech"
    else:
        location_column = f"location_{country}"

    df["related_within"] = df.apply(lambda row: row[location_column] and row["relation"] == 1, axis=1)

    total   = len(df)
    related = (df.relation.value_counts()[1] / total)*100
    within  = (df.related_within.value_counts()[True] / total)*100
    within_ = (df.related_within.value_counts()[True])
    skipped = (df.relation.value_counts()[2] / total)*100

    dict = {
        "Country": country,
        "Related (%)" : related,
        "Related - Within (%)": within,
        "Related - Within (n)": within_,
        "Unclassified (%)": skipped
    }

    data.append(dict)

In [84]:

tbl03 = pd.DataFrame(data)
total_row = pd.DataFrame(
    [[
        "European Union", 
        tbl02["Related (%)"].mean(),
        tbl02["Related - Within (%)"].mean(),
        tbl02["Related - Within (n)"].sum(),
        tbl02["Unclassified (%)"].mean(),
    ]], 
    columns=tbl03.columns
)
tbl03_total = pd.concat([tbl03, total_row], ignore_index=True)

In [85]:

In [85]:

tbl03_total.style.hide(axis="index").format({
    "Related (%)": "{:,.1f}",
    "Related - Within (%)": "{:,.1f}",
    "Related - Within (n)": "{:,.0f}",
    "Unclassified (%)": "{:,.1f}"
})

Table 3: Broad classification results

Country	Related (%)	Related - Within (%)	Related - Within (n)	Unclassified (%)
Austria	24.9	12.7	5,590	0.5
Belgium	24.2	13.8	2,737	1.0
Bulgaria	35.1	24.4	8,071	0.3
Croatia	28.8	17.6	6,431	0.8
Cyprus	29.1	19.9	6,716	0.3
Czechia	21.8	17.5	7,114	1.3
Denmark	30.6	17.6	1,897	0.9
Estonia	22.5	14.1	1,599	0.3
Finland	32.8	23.5	1,501	0.8
France	26.5	15.8	9,955	0.7
Germany	29.4	23.0	9,292	0.8
Greece	26.6	17.1	8,027	1.2
Hungary	25.8	15.4	2,768	0.8
Ireland	8.0	5.5	2,676	0.2
Italy	29.1	24.1	18,800	0.8
Latvia	15.1	11.6	636	0.6
Lithuania	31.5	24.6	3,272	0.7
Luxembourg	26.7	14.7	1,119	0.5
Malta	37.6	30.4	3,205	0.4
Netherlands	24.8	14.2	3,223	0.6
Poland	40.7	33.7	5,902	0.8
Portugal	23.2	14.8	4,217	0.3
Romania	43.6	27.2	8,765	1.0
Slovakia	26.8	17.9	5,172	0.4
Slovenia	30.7	18.2	1,786	0.7
Spain	27.7	18.9	16,685	1.0
Sweden	26.7	15.6	968	2.5
European Union	27.8	18.7	148,124	0.8

Table 4: Second Stage Classification

In [27]:

tbl04_data = (
    cldata_master.copy()
    .drop_duplicates(subset = "id")
    .loc[:,["country", "pillar_1", "pillar_2", "pillar_3", "pillar_4", "pillar_5", "pillar_6", "pillar_7", "pillar_8"]]
    .groupby("country")
    .agg("mean")
    .apply(lambda x: x*100)
    .reset_index()
    .rename(
        columns = {
            "country" : "Country",
            "pillar_1": "Pillar 1",
            "pillar_2": "Pillar 2",
            "pillar_3": "Pillar 3",
            "pillar_4": "Pillar 4",
            "pillar_5": "Pillar 5",
            "pillar_6": "Pillar 6",
            "pillar_7": "Pillar 7",
            "pillar_8": "Pillar 8",
        }
    )
)
total_row = pd.DataFrame(
    [["European Union"] + tbl04_data.iloc[:, 1:].mean().values.tolist()], 
    columns=tbl04_data.columns
)
tbl04_total = pd.concat([tbl04_data, total_row], ignore_index=True)

In [28]:

In [28]:

tbl04_total.style.hide(axis="index").format(precision=1)

Table 4: Pillar classification results (%)

Country	Pillar 1	Pillar 2	Pillar 3	Pillar 4	Pillar 5	Pillar 6	Pillar 7	Pillar 8
Austria	18.9	9.8	2.2	15.3	5.8	2.5	6.1	15.8
Belgium	13.3	9.0	1.4	12.8	5.8	1.9	6.5	17.1
Bulgaria	27.4	16.7	3.6	17.8	8.1	5.2	9.2	21.3
Croatia	20.6	13.0	2.3	15.6	8.1	3.0	6.5	19.8
Cyprus	22.6	13.7	3.9	16.2	7.5	6.1	7.9	15.6
Czechia	14.7	9.1	1.7	10.8	6.6	2.7	4.6	14.6
Denmark	24.3	12.8	3.0	20.2	9.2	3.0	8.2	20.5
Estonia	16.9	8.9	2.0	11.7	4.3	3.4	5.9	13.2
Finland	24.2	12.9	1.9	20.3	10.2	3.6	8.2	22.5
France	20.5	9.7	2.3	18.5	7.1	3.1	5.6	15.3
Germany	20.9	9.5	2.1	17.7	9.2	2.5	7.0	19.4
Greece	20.2	12.1	2.6	16.7	8.6	3.9	6.3	17.4
Hungary	21.8	13.2	2.6	16.4	5.9	3.6	4.0	14.0
Ireland	18.6	9.4	2.7	16.5	7.0	3.7	8.0	16.2
Italy	17.9	12.3	2.5	17.2	8.2	4.1	7.4	18.4
Latvia	10.4	6.1	2.0	7.5	3.1	3.3	2.1	6.2
Lithuania	22.7	14.1	2.4	15.3	6.6	4.4	7.7	20.7
Luxembourg	16.4	8.3	3.8	15.8	6.6	3.7	5.8	13.1
Malta	30.1	19.3	7.4	24.0	6.4	9.2	13.0	21.4
Netherlands	16.6	9.6	2.6	15.5	5.6	3.0	6.0	14.8
Poland	31.6	18.1	3.8	23.1	7.6	5.6	11.4	23.5
Portugal	18.6	9.5	3.0	13.6	3.1	3.6	5.2	10.6
Romania	28.6	23.3	4.1	22.5	9.5	7.0	8.1	25.0
Slovakia	23.1	13.0	3.6	16.0	4.6	3.0	4.9	15.5
Slovenia	24.4	11.9	3.1	18.4	7.4	4.3	7.4	16.4
Spain	20.6	11.0	2.6	17.3	6.3	3.3	6.1	15.8
Sweden	19.8	10.2	1.9	18.1	9.7	2.0	6.3	19.1
European Union	20.9	12.1	2.9	16.7	7.0	3.9	6.9	17.1