1. Extraction code

In [1]:
import pandas as pd
import os
import requests
import json
import time
from dotenv import load_dotenv

Loading API Key

In [3]:
# Loading API KEY from environment
load_dotenv()
newscatcher_key_v2 = os.getenv("newscatcher_key_v2")
newscatcher_key_v3 = os.getenv("newscatcher_key_v3")

Checking Subscription Plan (only for V3)

In [195]:
# Defining URL, header, and parameters
url      = "https://v3-api.newscatcherapi.com/api/subscription"
headers  = {"x-api-token" : newscatcher_key_v3}

# Sending the GET request
response = requests.get(url, headers=headers)

# Parsing the results
json_format = json.loads(response.text)
print(json_format)
{'active': True, 'concurrent_calls': 1, 'plan': 'v3_nlp', 'plan_calls': 10000, 'remaining_calls': 7557, 'historical_days': 60}
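Before launching a long extraction run, the remaining quota can be read from the same response. A minimal sketch (not part of the original pipeline; the 500-call threshold is an arbitrary example):

# Abort early if the V3 plan is close to its call limit
remaining = json_format["remaining_calls"]
if remaining < 500:
    raise RuntimeError(f"Only {remaining} API calls left on the V3 plan")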

Loading Keywords and Sources

In [8]:
sources = pd.read_excel("../inputs/EU_sources_1.xlsx")
sources["shortURL"] = sources["URL"].replace(r"^https?://|www\.|/", "", regex=True)
In [88]:
keywords      = pd.read_excel("../inputs/keywords_1.xlsx")
keywords_long = pd.melt(
    keywords, 
    id_vars    = "Group", 
    var_name   = "language", 
    value_name = "keyword"
)
In [93]:
institutions = pd.read_excel("../inputs/country_institutions_1.xlsx").dropna(subset=["translation"])

Gathering News Articles

In [198]:
def gatherKeywords(language, country):
    """
    This function takes a language and a country as arguments and retrieves a set of query-styled strings
    from the keywords and institutions data frames.
    """

    output = []

    # Collapsing keyword batches
    batches = sorted(keywords_long["Group"].unique())
    for batch in batches:
        subset  = (
            keywords_long
            .copy()
            .loc[keywords_long["language"] == language]
            .loc[keywords_long["Group"] == batch]
        )
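        # Quote each keyword and join with OR; keywords containing "/" are treated
        # as alternatives and split into separate quoted terms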
        query_style = " OR ".join(['"' + word + '"' for word in subset.keyword])
        query_style = query_style.replace("/", '" OR "')
        output.append(query_style)
    
    # Collapsing institutional names
    institutional_names = (
        institutions
        .copy()
        .loc[institutions["country"] == country]
    )
    query_style = " OR ".join(['"' + word + '"' for word in institutional_names.translation])
    output.append(query_style)

    return output
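
# For illustration only (hypothetical keywords, not taken from the input files),
# the quoting and joining in gatherKeywords produce query strings of the form:
#   " OR ".join('"' + w + '"' for w in ["rule of law", "judicial independence"])
#   -> '"rule of law" OR "judicial independence"'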

def newsFetcher(query, source, date_0 = "7 months ago", date_1 = "2 months ago", v2 = True):
    """
    This function takes a query and a news source as inputs and returns a data frame with all the results of that specific query 
    through either V2 or V3 of the Newscatcher API.
    """

    # Defining initial counters
    page   = 1
    npage  = 100

    # Creating an empty list to store results
    outputs = []

    while page <= npage:

        # Defining URL, header, and parameters
        if v2:
            url      = "https://api.newscatcherapi.com/v2/search?"
            headers  = {"x-api-key" : newscatcher_key_v2}
            params   = {
                "q"         : query,
                "sources"   : source,
                "page"      : page,
                "page_size" : 100,
                "from"      : date_0,
                "to"        : date_1,
                "sort_by"   : "date"
            }
        else:
            url      = "https://v3-api.newscatcherapi.com/api/search?"
            headers  = {"x-api-token" : newscatcher_key_v3}
            params   = {
                "q"         : query,
                "sources"   : source,
                "page"      : page,
                "page_size" : 1000,
                "from_"     : date_0,
                "to_"       : date_1,
                "sort_by"   : "date"
            }
    
        # Sending a GET call
        response = requests.get(url, params=params, headers=headers)
        time.sleep(1) # The API has a restriction of 1 call per second

        # Parsing the response as JSON
        json_data = json.loads(response.text)

        # Updating counters
        npage      = json_data["total_pages"]
        total_hits = json_data["total_hits"]

        if total_hits > 0:
        
            # Converting results to pandas data frame
            df = pd.DataFrame(json_data["articles"])
            min_date   = df.iloc[-1]["published_date"]
            outputs.append(df)

            # Increasing/resetting counters: move to the next page while the query stays
            # under the 10,000-hit cap or pages remain; otherwise restart pagination,
            # using the oldest retrieved date as the new end date
            if total_hits < 10000 or page < npage:
                page = page + 1
            else:
                page   = 1
                date_1 = min_date
    
    # Merging entire list of data frames
    if outputs:
        results = pd.concat(outputs, ignore_index=True)
    else:
        results = None

    return results
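
# Example call (hypothetical query and source; running it would consume API quota):
#   df = newsFetcher('"rule of law" OR "european commission"', "lemonde.fr",
#                    date_0 = "7 months ago", date_1 = "2 months ago", v2 = True)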

def extractNews(source, country, language, from_ = "7 months ago", to_ = "2 months ago", v2 = True):
    """
    This function takes a source's URL, the country it belongs to, and the language of the publication, and 
    retrieves the news articles associated with the pre-defined queries within that news source.
    """

    print(f"Extracting news articles from: {source}")
    
    # Creating an empty dictionary to store results
    results_per_batch = {}

    # Iterating across batches of keywords/queries
    qbatches = gatherKeywords(language, country)
    for n, batch in enumerate(qbatches, start = 1):
        print(f"===== Extracting articles from Batch #{n}")
        batch_name   = f"Batch {n}"
        fetched_news = newsFetcher(batch, source, from_, to_, v2)
        results_per_batch[batch_name] = fetched_news

    # Defining the outcome
    output = {
        source : results_per_batch
    }

    return output

def mergeData(dta, version):
    """
    This function takes a list containing all the data returned from the API, compiles it into one data set per source, and saves each as a compressed parquet file.
    """
    for element in dta:
        for source, batches in element.items():
            print(source)
            data_list    = [data for batch, data in batches.items()]
            empty_source = all(data is None for data in data_list)
            if not empty_source:
                master_data  = pd.concat(data_list)
                master_data.to_parquet(f"../data/data-extraction-1/{version}/{source}.parquet.gzip", compression = "gzip")    
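
# For reference, mergeData expects the list produced by the extractNews calls below:
# a list of nested dictionaries, one per source, mapping each batch name to a data
# frame of articles (or None when a batch returned no hits). Source and batch names
# here are illustrative:
#   [{"lemonde.fr": {"Batch 1": <articles DataFrame>, "Batch 2": None, ...}}, ...]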

Using API V2 to gather news

In [7]:
results_list_v2 = (
    sources
    # .loc[64:64]
    .apply(lambda row: extractNews(row["shortURL"], row["Country"], row["Language"]), axis = 1)
    .tolist()
)
In [8]:
mergeData(results_list_v2, version = "v2")

Using API V3 to gather news

In [10]:
results_list_v3 = (
    sources
    # .loc[(sources["Priority"] == "Yes") & (sources["HP"] == "No")]
    .loc[sources["Priority"] == "No"]
    .apply(lambda row: extractNews(
        row["shortURL"], row["Country"], row["Language"],
        from_ = "2 months ago", to_ = "1 day ago", v2 = False
    ), axis = 1)
    .tolist()
)
In [11]:
mergeData(results_list_v3, version = "v3")

Compiling and saving data

In [112]:
files_v2 = os.listdir("../data/data-extraction-1/v2")
data_sources_v2 = [
    pd.read_parquet(f"../data/data-extraction-1/v2/{x}")
    for x in files_v2
]
master_v2 = pd.concat(data_sources_v2).drop_duplicates(subset = "_id")
master_v2.to_parquet("../data/data-extraction-1/master_1v2.parquet.gzip", compression = "gzip")
In [201]:
files_v3 = os.listdir("../data/data-extraction-1/v3")
data_sources_v3 = [
    pd.read_parquet(f"../data/data-extraction-1/v3/{x}")
    for x in files_v3
]
master_v3 = pd.concat(data_sources_v3).drop_duplicates(subset = "id")
master_v3.to_parquet("../data/data-extraction-1/master_1v3.parquet.gzip", compression = "gzip")

Harmonizing V2 and V3 data

In [6]:
master_v2 = pd.read_parquet("../data/data-extraction-1/master_1v2.parquet.gzip")
master_v3 = pd.read_parquet("../data/data-extraction-1/master_1v3.parquet.gzip")
In [16]:
target_columns = ["id", "link", "domain_url", "published_date", "title", "description", "content", "language", "is_opinion"]
master_v2_harm = (
    master_v2
    .rename(
        columns = {
            "excerpt"   : "description",
            "summary"   : "content",
            "clean_url" : "domain_url",
            "_id"       : "id"
        }
    )
    .loc[:, target_columns]
)
master_v3_harm = master_v3.loc[:, target_columns]
master_data    = (
    pd.concat([master_v2_harm, master_v3_harm])
    .drop_duplicates()
)
master_data = master_data.loc[master_data[["title", "content"]].notna().all(axis=1)]

Saving data

In [19]:
master_data.to_parquet("../data/data-extraction-1/master_extraction_1.parquet.gzip", compression = "gzip")

Subsetting data for translation

In [5]:
master_data = pd.read_parquet("../data/data-extraction-1/master_extraction_1.parquet.gzip")
In [15]:
top_journals = {
      "Luxembourg"  : ['lequotidien.lu','tageblatt.lu','wort.lu'],
      "Slovakia"    : ['dennikn.sk','sme.sk', 'pravda.sk'],
      "Hungary"     : ['magyarhirlap.hu','magyarnemzet.hu','nepszava.hu'],
      "Ireland"     : ['irishexaminer.com','independent.ie','irishtimes.com'],
      "Bulgaria"    : ['24chasa.bg','capital.bg','dnevnik.bg'],
      "Greece"      : ['kathimerini.gr','protothema.gr','tanea.gr'],
      "Portugal"    : ['expresso.pt','publico.pt','jn.pt'],
      "Cyprus"      : ['philenews.com','politis.com.cy','sigmalive.com'],
      "Czechia"     : ['blesk.cz','denik.cz','lidovky.cz'],
      "Estonia"     : ['delfi.ee','postimees.ee'],
      "Finland"     : ['aamulehti.fi','hs.fi','ksml.fi'],
      "France"      : ['liberation.fr','lefigaro.fr','lemonde.fr'],
      "Slovenia"    : ['delo.si','dnevnik.si','vecer.com'],
      "Spain"       : ['abc.es','elmundo.es','elpais.com','lavanguardia.com'],
      "Sweden"      : ['dn.se','gp.se','svd.se'],
      "Austria"     : ['derstandard.at','diepresse.com','sn.at'],
      "Belgium"     : ['hln.be','lesoir.be','standaard.be'],
      "Denmark"     : ['berlingske.dk','jyllands-posten.dk','politiken.dk'],
      "Germany"     : ['faz.net','spiegel.de','sueddeutsche.de','zeit.de'],
      "Italy"       : ['corriere.it','lastampa.it','repubblica.it'],
      "Netherlands" : ['ad.nl','nrc.nl','telegraaf.nl','volkskrant.nl'],
      "Croatia"     : ['jutarnji.hr','novilist.hr','vecernji.hr'],
      "Malta"       : ['independent.com.mt','timesofmalta.com','talk.mt'],
      "Latvia"      : ['db.lv','ir.lv','la.lv'],
      "Lithuania"   : ['baltic-review.com','lrytas.lt','ve.lt'],
      "Poland"      : ['fakt.pl','rp.pl','wyborcza.pl'],
      "Romania"     : ['adevarul.ro','evz.ro','libertatea.ro']
}

def assign_country(domain_url):
    for country, urls in top_journals.items():
        if domain_url in urls:
            return country
    return "Low priority journal"

master_data["country"] = master_data["domain_url"].apply(assign_country)
In [37]:
for country in sources["Country"].drop_duplicates().to_list():
    subset4translation = (
        master_data.copy()
        .loc[(master_data["country"] == country) & (master_data["is_opinion"] == False)]
    )
    print(f"{country}: {len(subset4translation)}")
    file_name = f"../data/data-extraction-1/data4translation/{country}_tp.parquet.gzip"
    subset4translation.to_parquet(file_name, compression = "gzip")
Austria: 46145
Belgium: 21287
Bulgaria: 38118
Croatia: 37068
Cyprus: 34955
Czechia: 41415
Denmark: 11312
Estonia: 12370
Finland: 6647
France: 64527
Germany: 45321
Greece: 49504
Hungary: 18111
Ireland: 48409
Italy: 93858
Latvia: 5487
Lithuania: 14396
Luxembourg: 7894
Malta: 10842
Netherlands: 23935
Poland: 21434
Portugal: 29624
Romania: 33264
Slovakia: 34874
Slovenia: 10211
Spain: 112820
Sweden: 6417