
3. Classification Performance Assessment

In [111]:
import pandas as pd
import numpy as np
import math
import re
import getpass

Loading Data

In [112]:
AI_data = pd.read_parquet(f"/Users/{getpass.getuser()}/OneDrive - World Justice Project/EU Subnational/EU-S Data/Automated Qualitative Checks/Data/GPT-vs-Gemini-data.parquet.gzip")
EU_team = pd.read_parquet(f"/Users/{getpass.getuser()}/OneDrive - World Justice Project/EU Subnational/EU-S Data/Automated Qualitative Checks/Data/human_labelling.parquet.gzip")
EU_team = EU_team[EU_team['bucket'] != "hortiz"]
horacio = pd.read_parquet(f"/Users/{getpass.getuser()}/OneDrive - World Justice Project/EU Subnational/EU-S Data/Automated Qualitative Checks/Data/hortiz.parquet.gzip")

Performance Metrics Function
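The function below computes standard binary-classification metrics from the four confusion-matrix counts. For reference, the Matthews correlation coefficient it reports is

$$\mathrm{MCC} = \frac{TP \cdot TN - FP \cdot FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}}$$

which, unlike accuracy, remains informative when the positive and negative classes are imbalanced.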

In [113]:
def calculate_performance(df, predicted_column, truth_column):
    true_pos   = len(df[(df[predicted_column] == 1) & (df[truth_column] == 1)])
    true_neg   = len(df[(df[predicted_column] == 0) & (df[truth_column] == 0)])
    false_neg  = len(df[(df[predicted_column] == 0) & (df[truth_column] == 1)])
    false_pos  = len(df[(df[predicted_column] == 1) & (df[truth_column] == 0)])

    accuracy  = (true_pos + true_neg) / (true_neg + true_pos + false_neg + false_pos) 
    precision = ((true_pos) / (true_pos + false_pos)) if (true_pos + false_pos) > 0 else 0
    recall    = ((true_pos) / (true_pos + false_neg)) if (true_pos + false_neg) > 0 else 0
    f1        = (2 * (precision * recall) / (precision + recall)) if (precision + recall) > 0 else 0
    tpr       = ((true_pos)/(true_pos + false_neg)) if (true_pos + false_neg) > 0 else 0
    fpr       = ((false_pos)/(false_pos + true_neg)) if (false_pos + true_neg) > 0 else 0 
    c_matrix  = np.array(
        [[true_neg, false_pos],[false_neg, true_pos]]
    )
    mcc_denom = math.sqrt(
        (true_pos+false_pos)*(true_pos+false_neg)*(true_neg+false_pos)*(true_neg+false_neg)
    )
    mcc       = ((true_pos*true_neg) - (false_pos*false_neg)) / mcc_denom if mcc_denom > 0 else 0

    summary = {
        'accuracy' : accuracy,
        'precision' : precision,
        'recall' : recall,
        'f1' : f1,
        'mcc' : mcc,
        'tpr' : tpr,
        'fpr' : fpr,
        'confusion_matrix' : c_matrix,
        'true_pos' : true_pos,
        'true_neg' : true_neg,
        'false_pos' : false_pos,
        'false_neg' : false_neg
    }

    return summary
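
A quick sanity check on a toy frame (illustrative only; the column names and values here are hypothetical):

In [ ]:
toy = pd.DataFrame({
    'pred':  [1, 0, 1, 1, 0, 0],   # hypothetical predicted labels
    'truth': [1, 0, 0, 1, 1, 0]    # hypothetical ground-truth labels
})
calculate_performance(toy, 'pred', 'truth')
# TP=2, TN=2, FP=1, FN=1 -> accuracy 0.67, precision 0.67, recall 0.67, f1 0.67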

Wrangling AI Data

In [114]:
# Ground truth: an article is rule-of-law related if Horacio flagged any of the eight pillars
AI_data['horacio_rol'] = (
    AI_data[[
        'horacio_pillar_1', 'horacio_pillar_2', 'horacio_pillar_3', 'horacio_pillar_4',
        'horacio_pillar_5', 'horacio_pillar_6', 'horacio_pillar_7', 'horacio_pillar_8'
    ]].eq(1).any(axis=1)
).astype(int)
AI_data['GPT_ROL'] = (AI_data['factor(s)'] != "Not related to Rule of Law").astype(int)
AI_data['Gemini_ROL'] = (AI_data['gemini_stage_1'] != "Unrelated").astype(int)

Wrangling Human Data

In [115]:
horacio['horacio_ROL'] = np.where((horacio['factor(s)'] != "Not related to Rule of Law"),1,0)
EU_team['model_ROL']   = np.where((EU_team['factor(s)'] != "Not related to Rule of Law"),1,0)

for df in [horacio, EU_team]:
    df['factor(s)'] = df['factor(s)'].astype(str)

pillars = range(1, 9)
for pillar in pillars:
    horacio[f'horacio_pillar_{pillar}'] = horacio['factor(s)'].apply(lambda x: int(bool(re.search(f'{pillar}:', str(x)))))
    EU_team[f'model_pillar_{pillar}'] = EU_team['factor(s)'].apply(lambda x: int(bool(re.search(f'{pillar}:', str(x)))))

horacio = horacio.drop(columns = ['link', 'factor(s)', 'sentiment', 'is_eu_related','related_country', 'comments'])
EU_team = EU_team.drop(columns = ['link', 'factor(s)', 'sentiment', 'is_eu_related', 'related_country', 'comments'])
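
For reference, a pillar is flagged whenever its number followed by a colon appears in the factor label (this relies on pillar numbers being single digits). A minimal check with a hypothetical label:

In [ ]:
sample = "1: Constraints on Government Powers; 4: Fundamental Rights"  # hypothetical label
[int(bool(re.search(f'{p}:', sample))) for p in range(1, 9)]
# -> [1, 0, 0, 1, 0, 0, 0, 0]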
In [116]:
dataframes = {}

for annotator in EU_team['bucket'].unique():
    subset = EU_team.loc[EU_team['bucket'] == annotator]
    human_vs_horacio = pd.merge(subset, horacio, on = "article_id", how = "inner")
    dataframes[annotator] = human_vs_horacio

Stage One Metrics

In [117]:
gpt_metrics = calculate_performance(AI_data, 'GPT_ROL', 'horacio_rol')
print('GPT metrics:', gpt_metrics)
gemini_metrics = calculate_performance(AI_data, 'Gemini_ROL', 'horacio_rol')
print('Gemini metrics: ', gemini_metrics)
GPT metrics: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'mcc': 1.0, 'tpr': 1.0, 'fpr': 0.0, 'confusion_matrix': array([[136,   0],
       [  0,  67]]), 'true_pos': 67, 'true_neg': 136, 'false_pos': 0, 'false_neg': 0}
Gemini metrics:  {'accuracy': 0.7980295566502463, 'precision': 0.782608695652174, 'recall': 0.5373134328358209, 'f1': 0.6371681415929203, 'mcc': 0.5209474243346167, 'tpr': 0.5373134328358209, 'fpr': 0.07352941176470588, 'confusion_matrix': array([[126,  10],
       [ 31,  36]]), 'true_pos': 36, 'true_neg': 126, 'false_pos': 10, 'false_neg': 31}
In [118]:
rows = []

for classifier in dataframes:
    metrics = calculate_performance(dataframes[classifier], 'model_ROL', 'horacio_ROL')
    rows.append({'classifier': classifier, **metrics})

classification_performance = pd.DataFrame(rows)
In [119]:
stage_one_summary = []
gpt_stage_one = {
    'tpr' : gpt_metrics["tpr"],
    "fpr" : gpt_metrics["fpr"],
    'confusion_matrix' : gpt_metrics["confusion_matrix"],
    'classifier' : 'GPT'
}
gem_stage_one = {
    'tpr' : gemini_metrics["tpr"],
    "fpr" : gemini_metrics["fpr"],
    'confusion_matrix' : gemini_metrics["confusion_matrix"],
    'classifier' : 'Gemini'
}

top = classification_performance.loc[
    (classification_performance['tpr'] > 0.8)
]
top_eu = {
    'tpr' : top['tpr'].mean(),
    'fpr' : top['fpr'].mean(),
    'confusion_matrix' : top['confusion_matrix'].sum(),
    'classifier' : 'Top Human Classifiers'
}
all_of_eu = {
    'tpr' : classification_performance['tpr'].mean(),
    'fpr' : classification_performance['fpr'].mean(),
    'confusion_matrix' : classification_performance['confusion_matrix'].sum(),
    'classifier' : 'Human Classifiers'
}

for i in [gpt_stage_one, gem_stage_one, top_eu, all_of_eu]:
    stage_one_summary.append(i)

stage_one = pd.DataFrame(stage_one_summary)
In [120]:
stage_one[["classifier", "tpr", "fpr", "confusion_matrix"]].rename(
        columns = {
            "classifier"       : "Classifier",
            "tpr"              : "TPR",
            "fpr"              : "FPR",
            "confusion_matrix" : "Confusion Matrix"
        }
    ).style.hide(axis="index").format({
    "TPR": "{:,.2f}",
    "FPR": "{:,.2f}"
})
Table 1: Classification Performance Assessment (Stage 1)
Classifier TPR FPR Confusion Matrix
GPT 1.00 0.00 [[136 0] [ 0 67]]
Gemini 0.54 0.07 [[126 10] [ 31 36]]
Top Human Classifiers 0.92 0.14 [[241 43] [ 7 75]]
Human Classifiers 0.75 0.09 [[511 56] [ 42 125]]
In [121]:
# Thresholds roughly match or exceed Gemini's stage-one scores (accuracy 0.80, precision 0.78, recall 0.54, F1 0.64)
better_than_gemini = classification_performance.loc[
    (classification_performance['accuracy'] >= 0.8) &
    (classification_performance['precision'] >= 0.8) &
    (classification_performance['recall'] >= 0.54) &
    (classification_performance['f1'] >= 0.64)
]
p = len(better_than_gemini) / len(classification_performance)
print(f"{p*100:.2f} % of the EU team classified articles better than Gemini ({len(better_than_gemini)} people).")
35.00 % of the EU team classified articles better than Gemini (7 people).

Stage Two Metrics

AI Metrics

In [122]:
# Stage two evaluates pillar assignment only on articles Horacio marked as rule-of-law related
AI_data = AI_data[AI_data['horacio_rol'] == 1]

gpt_columns = [
    'GPT_pillar_1','GPT_pillar_2','GPT_pillar_3','GPT_pillar_4',
    'GPT_pillar_5','GPT_pillar_6','GPT_pillar_7','GPT_pillar_8'
]
gemini_columns = [
    'Gemini_pillar_1','Gemini_pillar_2','Gemini_pillar_3','Gemini_pillar_4',
    'Gemini_pillar_5','Gemini_pillar_6','Gemini_pillar_7','Gemini_pillar_8'
]
horacio_columns = [
    'horacio_pillar_1', 'horacio_pillar_2', 'horacio_pillar_3', 'horacio_pillar_4', 
    'horacio_pillar_5', 'horacio_pillar_6', 'horacio_pillar_7', 'horacio_pillar_8'
]

performance_data = []
# Loop variables use *_col names so they don't shadow the horacio DataFrame loaded above
for horacio_col, gpt_col, gemini_col in zip(horacio_columns, gpt_columns, gemini_columns):

    gpt_performance = calculate_performance(AI_data, gpt_col, horacio_col)
    gpt_performance['Classifier'] = 'GPT'
    gpt_performance['Model_Column'] = gpt_col
    gpt_performance['Horacio'] = horacio_col
    performance_data.append(gpt_performance)

    gemini_performance = calculate_performance(AI_data, gemini_col, horacio_col)
    gemini_performance['Classifier'] = 'Gemini'
    gemini_performance['Model_Column'] = gemini_col
    gemini_performance['Horacio'] = horacio_col
    performance_data.append(gemini_performance)

AI_performance = pd.DataFrame(performance_data)

Human Metrics

In [123]:
stage_two = {}
for classifier, df in dataframes.items():
    filtered_df = df.loc[df['horacio_ROL'] == 1]
    stage_two[classifier] = filtered_df

EU_columns = [
    'model_pillar_1', 'model_pillar_2', 'model_pillar_3', 'model_pillar_4', 
    'model_pillar_5', 'model_pillar_6', 'model_pillar_7', 'model_pillar_8'
]
horacio_columns = [
    'horacio_pillar_1', 'horacio_pillar_2', 'horacio_pillar_3', 'horacio_pillar_4',
    'horacio_pillar_5', 'horacio_pillar_6', 'horacio_pillar_7', 'horacio_pillar_8'
]

stage_two_classification_performance = []

for classifier in stage_two:
    for i, (eu_col, horacio_col) in enumerate(zip(EU_columns, horacio_columns)):
        classifier_df = stage_two[classifier]

        metrics = calculate_performance(classifier_df, eu_col, horacio_col)

        new_row = {
            'classifier': classifier,
            'pillar': f'Pillar {i + 1}',
            'accuracy': metrics['accuracy'],
            'precision': metrics['precision'],
            'recall': metrics['recall'],
            'f1': metrics['f1'],
            'mcc': metrics['mcc'],
            'tpr' : metrics['tpr'],
            'fpr' : metrics['fpr'],
            'confusion_matrix': metrics['confusion_matrix'],  # store the array directly, matching the AI rows
            'true_pos' : metrics['true_pos'],
            'true_neg' : metrics['true_neg'],
            'false_pos' : metrics['false_pos'],
            'false_neg' : metrics['false_neg']
        }

        stage_two_classification_performance.append(new_row)

stage_two_classification_performance = pd.DataFrame(stage_two_classification_performance)
In [124]:
def map_pillar(column):
    # Extract 'Pillar N' from a column name like 'GPT_pillar_3'
    if 'pillar_' in column:
        pillar_number = column.split('_')[-1]
        return f'Pillar {pillar_number}'
    return None

AI_performance['pillar'] = AI_performance['Model_Column'].apply(map_pillar)
In [125]:
AI_performance = AI_performance.rename(
    columns = {'Classifier' : 'classifier'}  # 'confusion_matrix' already matches the human-team column name
)
AI_performance = AI_performance.drop(columns = ['Model_Column', 'Horacio'])
performance = pd.concat([AI_performance, stage_two_classification_performance], ignore_index=True)
In [126]:
p1 = performance.loc[performance['pillar'] == "Pillar 1"]
pillar_one = []

# The pooled confusion matrices below are rebuilt from the count columns,
# so no per-row matrix conversion is needed
p1_top = p1.loc[
    (p1['classifier'] != 'lcleary') & (p1['tpr'] >= 0.5) & (p1['fpr'] < .4)
]
eu_p1 = p1.loc[
    (p1['classifier'] != 'GPT') & (p1['classifier'] != 'Gemini')
]

gem_p1 = {
    'tpr' : p1.loc[(p1['classifier'] == 'Gemini')]['tpr'].iloc[0],
    'fpr' : p1.loc[(p1['classifier'] == 'Gemini')]['fpr'].iloc[0],
    'confusion_matrix' : np.array(p1.loc[(p1['classifier'] == 'Gemini')]['confusion_matrix'].iloc[0]),
    'classifier' : 'Gemini',
    'pillar' : 'Pillar 1'
}

gpt_p1 = {
    'tpr': p1.loc[p1['classifier'] == 'GPT', 'tpr'].iloc[0],
    'fpr': p1.loc[p1['classifier'] == 'GPT', 'fpr'].iloc[0],  
    'confusion_matrix': np.array(p1.loc[p1['classifier'] == 'GPT', 'confusion_matrix'].iloc[0]),
    'classifier': 'GPT',
    'pillar': 'Pillar 1'
}

top_p1 = {
    'tpr': p1_top['tpr'].mean(),
    'fpr': p1_top['fpr'].mean(),
    'confusion_matrix' : np.array([[p1_top['true_neg'].sum(), p1_top['false_pos'].sum()],
                        [p1_top['false_neg'].sum(), p1_top['true_pos'].sum()]]),
    'classifier': 'Top Human Classifiers',
    'pillar': 'Pillar 1'
}

all_eu_p1 = {
    'tpr' : eu_p1['tpr'].mean(),
    'fpr' : eu_p1['fpr'].mean(),
    'confusion_matrix' : np.array([[eu_p1['true_neg'].sum(),eu_p1['false_pos'].sum()],
                        [eu_p1['false_neg'].sum(), eu_p1['true_pos'].sum()]]),
    'classifier' : 'Human Classifiers',
    'pillar' : 'Pillar 1'
}

for i in [gpt_p1, gem_p1, top_p1, all_eu_p1]:
    pillar_one.append(i)

pillar_one = pd.DataFrame(pillar_one)
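The next seven cells repeat this per-pillar aggregation verbatim. As a refactoring sketch (not part of the original analysis), a helper along these lines could replace them; it assumes the `performance` columns built above and mirrors the original filters:

In [ ]:
def summarize_pillar(performance, pillar, exclude = ('lcleary',)):
    # Build the four summary rows (GPT, Gemini, top humans, all humans) for one pillar
    p = performance.loc[performance['pillar'] == pillar]
    humans = p.loc[~p['classifier'].isin(['GPT', 'Gemini'])]
    # Mirrors the notebook's filter; note it screens by TPR/FPR only and does not
    # explicitly exclude the AI classifiers from the "top" pool
    top = p.loc[(~p['classifier'].isin(exclude)) & (p['tpr'] >= 0.5) & (p['fpr'] < 0.4)]

    def pooled_matrix(df):
        # Pool confusion matrices by summing the per-classifier counts
        return np.array([[df['true_neg'].sum(), df['false_pos'].sum()],
                         [df['false_neg'].sum(), df['true_pos'].sum()]])

    rows = []
    for name in ['GPT', 'Gemini']:
        row = p.loc[p['classifier'] == name].iloc[0]
        rows.append({'tpr': row['tpr'], 'fpr': row['fpr'],
                     'confusion_matrix': np.array(row['confusion_matrix']),
                     'classifier': name, 'pillar': pillar})
    rows.append({'tpr': top['tpr'].mean(), 'fpr': top['fpr'].mean(),
                 'confusion_matrix': pooled_matrix(top),
                 'classifier': 'Top Human Classifiers', 'pillar': pillar})
    rows.append({'tpr': humans['tpr'].mean(), 'fpr': humans['fpr'].mean(),
                 'confusion_matrix': pooled_matrix(humans),
                 'classifier': 'Human Classifiers', 'pillar': pillar})
    return pd.DataFrame(rows)

# e.g. pillar_two = summarize_pillar(performance, 'Pillar 2')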
In [127]:
p2 = performance.loc[performance['pillar'] == "Pillar 2"]

pillar_two = []

p2_top = p2.loc[
    (p2['classifier'] != 'lcleary') & (p2['tpr'] >= 0.5) & (p2['fpr'] < .4)
]
eu_p2 = p2.loc[
    (p2['classifier'] != 'GPT') & (p2['classifier'] != 'Gemini')
]

gem_p2 = {
    'tpr' : p2.loc[(p2['classifier'] == 'Gemini')]['tpr'].iloc[0],
    'fpr' : p2.loc[(p2['classifier'] == 'Gemini')]['fpr'].iloc[0],
    'confusion_matrix' : np.array(p2.loc[(p2['classifier'] == 'Gemini')]['confusion_matrix'].iloc[0]),
    'classifier' : 'Gemini',
    'pillar' : 'Pillar 2'
}

gpt_p2 = {
    'tpr': p2.loc[p2['classifier'] == 'GPT', 'tpr'].iloc[0],
    'fpr': p2.loc[p2['classifier'] == 'GPT', 'fpr'].iloc[0],  
    'confusion_matrix': np.array(p2.loc[p2['classifier'] == 'GPT', 'confusion_matrix'].iloc[0]),
    'classifier': 'GPT',
    'pillar': 'Pillar 2'
}

top_p2 = {
    'tpr': p2_top['tpr'].mean(),
    'fpr': p2_top['fpr'].mean(),
    'confusion_matrix' : np.array([[p2_top['true_neg'].sum(), p2_top['false_pos'].sum()],
                        [p2_top['false_neg'].sum(), p2_top['true_pos'].sum()]]),
    'classifier': 'Top Human Classifiers',
    'pillar': 'Pillar 2'
}

all_eu_p2 = {
    'tpr' : eu_p2['tpr'].mean(),
    'fpr' : eu_p2['fpr'].mean(),
    'confusion_matrix' : np.array([[eu_p2['true_neg'].sum(),eu_p2['false_pos'].sum()],
                        [eu_p2['false_neg'].sum(), eu_p2['true_pos'].sum()]]),
    'classifier' : 'Human Classifiers',
    'pillar' : 'Pillar 2'
}

for i in [gpt_p2, gem_p2, top_p2, all_eu_p2]:
    pillar_two.append(i)

pillar_two = pd.DataFrame(pillar_two)
In [128]:
p3  = performance.loc[performance['pillar'] == "Pillar 3"]

pillar_three = []

p3_top = p3.loc[
    (p3['classifier'] != 'lcleary') & (p3['tpr'] >= 0.5) & (p3['fpr'] < .4)
]
eu_p3 = p3.loc[
    (p3['classifier'] != 'GPT') & (p3['classifier'] != 'Gemini')
]

gem_p3 = {
    'tpr' : p3.loc[(p3['classifier'] == 'Gemini')]['tpr'].iloc[0],
    'fpr' : p3.loc[(p3['classifier'] == 'Gemini')]['fpr'].iloc[0],
    'confusion_matrix' : np.array(p3.loc[(p3['classifier'] == 'Gemini')]['confusion_matrix'].iloc[0]),
    'classifier' : 'Gemini',
    'pillar' : 'Pillar 3'
}

gpt_p3 = {
    'tpr': p3.loc[p3['classifier'] == 'GPT', 'tpr'].iloc[0],
    'fpr': p3.loc[p3['classifier'] == 'GPT', 'fpr'].iloc[0],  
    'confusion_matrix': np.array(p3.loc[p3['classifier'] == 'GPT', 'confusion_matrix'].iloc[0]),
    'classifier': 'GPT',
    'pillar': 'Pillar 3'
}

top_p3 = {
    'tpr': p3_top['tpr'].mean(),
    'fpr': p3_top['fpr'].mean(),
    'confusion_matrix' : np.array([[p3_top['true_neg'].sum(), p3_top['false_pos'].sum()],
                        [p3_top['false_neg'].sum(), p3_top['true_pos'].sum()]]),
    'classifier': 'Top Human Classifiers',
    'pillar': 'Pillar 3'
}


all_eu_p3 = {
    'tpr' : eu_p3['tpr'].mean(),
    'fpr' : eu_p3['fpr'].mean(),
    'confusion_matrix' : np.array([[eu_p3['true_neg'].sum(),eu_p3['false_pos'].sum()],
                        [eu_p3['false_neg'].sum(), eu_p3['true_pos'].sum()]]),
    'classifier' : 'Human Classifiers',
    'pillar' : 'Pillar 3'
}

for i in [gpt_p3, gem_p3, top_p3, all_eu_p3]:
    pillar_three.append(i)

pillar_three = pd.DataFrame(pillar_three)
In [129]:
p4 = performance.loc[performance['pillar'] == "Pillar 4"]

pillar_four = []

p4_top = p4.loc[
    (p4['classifier'] != 'lcleary') & (p4['tpr'] >= 0.5) & (p4['fpr'] < .4)
]
eu_p4 = p4.loc[
    (p4['classifier'] != 'GPT') & (p4['classifier'] != 'Gemini')
]

gem_p4 = {
    'tpr' : p4.loc[(p4['classifier'] == 'Gemini')]['tpr'].iloc[0],
    'fpr' : p4.loc[(p4['classifier'] == 'Gemini')]['fpr'].iloc[0],
    'confusion_matrix' : np.array(p4.loc[(p4['classifier'] == 'Gemini')]['confusion_matrix'].iloc[0]),
    'classifier' : 'Gemini',
    'pillar' : 'Pillar 4'
}

gpt_p4 = {
    'tpr': p4.loc[p4['classifier'] == 'GPT', 'tpr'].iloc[0],
    'fpr': p4.loc[p4['classifier'] == 'GPT', 'fpr'].iloc[0],  
    'confusion_matrix': np.array(p4.loc[p4['classifier'] == 'GPT', 'confusion_matrix'].iloc[0]),
    'classifier': 'GPT',
    'pillar': 'Pillar 4'
}

top_p4 = {
    'tpr': p4_top['tpr'].mean(),
    'fpr': p4_top['fpr'].mean(),
    'confusion_matrix' : np.array([[p4_top['true_neg'].sum(), p4_top['false_pos'].sum()],
                        [p4_top['false_neg'].sum(), p4_top['true_pos'].sum()]]),
    'classifier': 'Top Human Classifiers',
    'pillar': 'Pillar 4'
}


all_eu_p4 = {
    'tpr' : eu_p4['tpr'].mean(),
    'fpr' : eu_p4['fpr'].mean(),
    'confusion_matrix' : np.array([[eu_p4['true_neg'].sum(),eu_p4['false_pos'].sum()],
                        [eu_p4['false_neg'].sum(), eu_p4['true_pos'].sum()]]),
    'classifier' : 'Human Classifiers',
    'pillar' : 'Pillar 4'
}

for i in [gpt_p4, gem_p4, top_p4, all_eu_p4]:
    pillar_four.append(i)

pillar_four = pd.DataFrame(pillar_four)
In [130]:
p5 = performance.loc[performance['pillar'] == "Pillar 5"]

pillar_five = []

p5_top = p5.loc[
    (p5['classifier'] != 'lcleary') & (p5['tpr'] >= 0.5) & (p5['fpr'] < .4)
]
eu_p5 = p5.loc[
    (p5['classifier'] != 'GPT') & (p5['classifier'] != 'Gemini')
]

gem_p5 = {
    'tpr' : p5.loc[(p5['classifier'] == 'Gemini')]['tpr'].iloc[0],
    'fpr' : p5.loc[(p5['classifier'] == 'Gemini')]['fpr'].iloc[0],
    'confusion_matrix' : np.array(p5.loc[(p5['classifier'] == 'Gemini')]['confusion_matrix'].iloc[0]),
    'classifier' : 'Gemini',
    'pillar' : 'Pillar 5'
}

gpt_p5 = {
    'tpr': p5.loc[p5['classifier'] == 'GPT', 'tpr'].iloc[0],
    'fpr': p5.loc[p5['classifier'] == 'GPT', 'fpr'].iloc[0],  
    'confusion_matrix': np.array(p5.loc[p5['classifier'] == 'GPT', 'confusion_matrix'].iloc[0]),
    'classifier': 'GPT',
    'pillar': 'Pillar 5'
}

top_p5 = {
    'tpr': p5_top['tpr'].mean(),
    'fpr': p5_top['fpr'].mean(),
    'confusion_matrix' : np.array([[p5_top['true_neg'].sum(), p5_top['false_pos'].sum()],
                        [p5_top['false_neg'].sum(), p5_top['true_pos'].sum()]]),
    'classifier': 'Top Human Classifiers',
    'pillar': 'Pillar 5'
}

all_eu_p5 = {
    'tpr' : eu_p5['tpr'].mean(),
    'fpr' : eu_p5['fpr'].mean(),
    'confusion_matrix' : np.array([[eu_p5['true_neg'].sum(),eu_p5['false_pos'].sum()],
                        [eu_p5['false_neg'].sum(), eu_p5['true_pos'].sum()]]),
    'classifier' : 'Human Classifiers',
    'pillar' : 'Pillar 5'
}

for i in [gpt_p5, gem_p5, top_p5, all_eu_p5]:
    pillar_five.append(i)

pillar_five = pd.DataFrame(pillar_five)
In [131]:
p6 = performance.loc[performance['pillar'] == "Pillar 6"]

pillar_six = []

p6_top = p6.loc[
    (p6['classifier'] != 'lcleary') & (p6['tpr'] >= 0.5) & (p6['fpr'] < .4)
]
eu_p6 = p6.loc[
    (p6['classifier'] != 'GPT') & (p6['classifier'] != 'Gemini')
]

gem_p6 = {
    'tpr' : p6.loc[(p6['classifier'] == 'Gemini')]['tpr'].iloc[0],
    'fpr' : p6.loc[(p6['classifier'] == 'Gemini')]['fpr'].iloc[0],
    'confusion_matrix' : np.array(p6.loc[(p6['classifier'] == 'Gemini')]['confusion_matrix'].iloc[0]),
    'classifier' : 'Gemini',
    'pillar' : 'Pillar 6'
}

gpt_p6 = {
    'tpr': p6.loc[p6['classifier'] == 'GPT', 'tpr'].iloc[0],
    'fpr': p6.loc[p6['classifier'] == 'GPT', 'fpr'].iloc[0],  
    'confusion_matrix': np.array(p6.loc[p6['classifier'] == 'GPT', 'confusion_matrix'].iloc[0]),
    'classifier': 'GPT',
    'pillar': 'Pillar 6'
}

top_p6 = {
    'tpr': p6_top['tpr'].mean(),
    'fpr': p6_top['fpr'].mean(),
    'confusion_matrix' : np.array([[p6_top['true_neg'].sum(), p6_top['false_pos'].sum()],
                        [p6_top['false_neg'].sum(), p6_top['true_pos'].sum()]]),
    'classifier': 'Top Human Classifiers',
    'pillar': 'Pillar 6'
}

all_eu_p6 = {
    'tpr' : eu_p6['tpr'].mean(),
    'fpr' : eu_p6['fpr'].mean(),
    'confusion_matrix' : np.array([[eu_p6['true_neg'].sum(),eu_p6['false_pos'].sum()],
                        [eu_p6['false_neg'].sum(), eu_p6['true_pos'].sum()]]),
    'classifier' : 'Human Classifiers',
    'pillar' : 'Pillar 6'
}

for i in [gpt_p6, gem_p6, top_p6, all_eu_p6]:
    pillar_six.append(i)

pillar_six = pd.DataFrame(pillar_six)
In [132]:
p7 = performance.loc[performance['pillar'] == "Pillar 7"]

pillar_seven = []

p7_top = p7.loc[
    (p7['classifier'] != 'lcleary') & (p7['tpr'] >= 0.5) & (p7['fpr'] < .4)
]
eu_p7 = p7.loc[
    (p7['classifier'] != 'GPT') & (p7['classifier'] != 'Gemini')
]

gem_p7 = {
    'tpr' : p7.loc[(p7['classifier'] == 'Gemini')]['tpr'].iloc[0],
    'fpr' : p7.loc[(p7['classifier'] == 'Gemini')]['fpr'].iloc[0],
    'confusion_matrix' : np.array(p7.loc[(p7['classifier'] == 'Gemini')]['confusion_matrix'].iloc[0]),
    'classifier' : 'Gemini',
    'pillar' : 'Pillar 7'
}

gpt_p7 = {
    'tpr': p7.loc[p7['classifier'] == 'GPT', 'tpr'].iloc[0],
    'fpr': p7.loc[p7['classifier'] == 'GPT', 'fpr'].iloc[0],  
    'confusion_matrix': np.array(p7.loc[p7['classifier'] == 'GPT', 'confusion_matrix'].iloc[0]),
    'classifier': 'GPT',
    'pillar': 'Pillar 7'
}

top_p7 = {
    'tpr': p7_top['tpr'].mean(),
    'fpr': p7_top['fpr'].mean(),
    'confusion_matrix' : np.array([[p7_top['true_neg'].sum(), p7_top['false_pos'].sum()],
                        [p7_top['false_neg'].sum(), p7_top['true_pos'].sum()]]),
    'classifier': 'Top Human Classifiers',
    'pillar': 'Pillar 7'
}

all_eu_p7 = {
    'tpr' : eu_p7['tpr'].mean(),
    'fpr' : eu_p7['fpr'].mean(),
    'confusion_matrix' : np.array([[eu_p7['true_neg'].sum(),eu_p7['false_pos'].sum()],
                        [eu_p7['false_neg'].sum(), eu_p7['true_pos'].sum()]]),
    'classifier' : 'Human Classifiers',
    'pillar' : 'Pillar 7'
}

for i in [gpt_p7, gem_p7, top_p7, all_eu_p7]:
    pillar_seven.append(i)

pillar_seven = pd.DataFrame(pillar_seven)
In [133]:
p8 = performance.loc[performance['pillar'] == "Pillar 8"]

pillar_eight = []

p8_top = p8.loc[
    (p8['classifier'] != 'lcleary') & (p8['tpr'] >= 0.5) & (p8['fpr'] < .4)
]
eu_p8 = p8.loc[
    (p8['classifier'] != 'GPT') & (p8['classifier'] != 'Gemini')
]

gem_p8 = {
    'tpr' : p8.loc[(p8['classifier'] == 'Gemini')]['tpr'].iloc[0],
    'fpr' : p8.loc[(p8['classifier'] == 'Gemini')]['fpr'].iloc[0],
    'confusion_matrix' : np.array(p8.loc[(p8['classifier'] == 'Gemini')]['confusion_matrix'].iloc[0]),
    'classifier' : 'Gemini',
    'pillar' : 'Pillar 8'
}

gpt_p8 = {
    'tpr': p8.loc[p8['classifier'] == 'GPT', 'tpr'].iloc[0],
    'fpr': p8.loc[p8['classifier'] == 'GPT', 'fpr'].iloc[0],  
    'confusion_matrix': np.array(p8.loc[p8['classifier'] == 'GPT', 'confusion_matrix'].iloc[0]),
    'classifier': 'GPT',
    'pillar': 'Pillar 8'
}

top_p8 = {
    'tpr': p8_top['tpr'].mean(),
    'fpr': p8_top['fpr'].mean(),
    'confusion_matrix' : np.array([[p8_top['true_neg'].sum(), p8_top['false_pos'].sum()],
                        [p8_top['false_neg'].sum(), p8_top['true_pos'].sum()]]),
    'classifier': 'Top Human Classifiers',
    'pillar': 'Pillar 8'
}


all_eu_p8 = {
    'tpr' : eu_p8['tpr'].mean(),
    'fpr' : eu_p8['fpr'].mean(),
    'confusion_matrix' : np.array([[eu_p8['true_neg'].sum(),eu_p8['false_pos'].sum()],
                        [eu_p8['false_neg'].sum(), eu_p8['true_pos'].sum()]]),
    'classifier' : 'Human Classifiers',
    'pillar' : 'Pillar 8'
}

for i in [gpt_p8, gem_p8, top_p8, all_eu_p8]:
    pillar_eight.append(i)

pillar_eight = pd.DataFrame(pillar_eight)
In [134]:
pillar_frames = [pillar_one, pillar_four, pillar_five]  # Table 2 reports Pillars 1, 4, and 5
stage_two_table = pd.concat(pillar_frames)
stage_two_table[["classifier", "pillar", "tpr", "fpr", "confusion_matrix"]].rename(
        columns = {
            "classifier"       : "Classifier",
            "pillar"           : "Pillar",
            "tpr"              : "TPR",
            "fpr"              : "FPR",
            "confusion_matrix" : "Confusion Matrix"
        }
    ).style.hide(axis="index").format({
    "TPR": "{:,.2f}",
    "FPR": "{:,.2f}"
})
Table 2: Classification Performance Assessment (Stage 2)
Classifier Pillar TPR FPR Confusion Matrix
GPT Pillar 1 0.69 0.21 [[30 8] [ 9 20]]
Gemini Pillar 1 0.52 0.37 [[24 14] [14 15]]
Top Human Classifiers Pillar 1 0.55 0.16 [[89 27] [38 52]]
Human Classifiers Pillar 1 0.35 0.07 [[85 7] [50 25]]
GPT Pillar 4 0.60 0.17 [[35 7] [10 15]]
Gemini Pillar 4 0.24 0.26 [[31 11] [19 6]]
Top Human Classifiers Pillar 4 0.66 0.15 [[57 11] [20 37]]
Human Classifiers Pillar 4 0.40 0.16 [[62 13] [53 39]]
GPT Pillar 5 0.33 0.10 [[44 5] [12 6]]
Gemini Pillar 5 0.22 0.14 [[42 7] [14 4]]
Top Human Classifiers Pillar 5 0.67 0.18 [[69 15] [ 8 16]]
Human Classifiers Pillar 5 0.42 0.18 [[106 25] [ 19 17]]