In [1]:
import pandas as pd
import os
import requests
import json
import time
from dotenv import load_dotenv
Loading API Key
In [3]:
# Loading the API keys from the environment
load_dotenv()
newscatcher_key_v2 = os.getenv("newscatcher_key_v2")
newscatcher_key_v3 = os.getenv("newscatcher_key_v3")
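Both keys are read from a local .env file. A small sanity check (a sketch, using the same variable names as above) makes the notebook fail early if either key is missing:

# Sketch: stop early if a key was not found in the .env file
for name, key in {"newscatcher_key_v2": newscatcher_key_v2, "newscatcher_key_v3": newscatcher_key_v3}.items():
    if not key:
        raise RuntimeError(f"Missing API key in .env: {name}")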
Checking Subscription Plan (only for V3)
In [195]:
# Defining URL, header, and parameters
url = "https://v3-api.newscatcherapi.com/api/subscription"
headers = {"x-api-token" : newscatcher_key_v3}

# Sending the GET call
response = requests.get(url, headers=headers)

# Parsing the results
json_format = json.loads(response.text)
print(json_format)
{'active': True, 'concurrent_calls': 1, 'plan': 'v3_nlp', 'plan_calls': 10000, 'remaining_calls': 7557, 'historical_days': 60}
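Since the V3 plan has a finite quota, a simple guard (a sketch; the 1,000-call threshold is an arbitrary illustration, not part of the original workflow) can stop the notebook before a long extraction run when the subscription is inactive or nearly exhausted:

# Sketch: abort before a long run if the plan is inactive or the quota is almost used up
if not json_format["active"] or json_format["remaining_calls"] < 1000:
    raise RuntimeError("V3 subscription inactive or quota nearly exhausted")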
Loading Keywords and Sources
In [8]:
= pd.read_excel("../inputs/EU_sources_1.xlsx")
sources "shortURL"] = sources["URL"].replace(r"^https?://|www\.|/", "", regex=True) sources[
In [88]:
= pd.read_excel("../inputs/keywords_1.xlsx")
keywords = pd.melt(
keywords_long
keywords, = "Group",
id_vars = "language",
var_name = "keyword"
value_name )
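The keywords file is stored wide (one column per language), so pd.melt reshapes it into one row per Group-language-keyword combination. A toy example with made-up columns shows the reshape:

# Toy example of the wide-to-long reshape (column names and values are illustrative)
toy = pd.DataFrame({"Group": ["A"], "French": ["état de droit"], "German": ["Rechtsstaat"]})
pd.melt(toy, id_vars = "Group", var_name = "language", value_name = "keyword")
#   Group language        keyword
# 0     A   French  état de droit
# 1     A   German     Rechtsstaat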
In [93]:
= pd.read_excel("../inputs/country_institutions_1.xlsx").dropna(subset=["translation"]) institutions
Gathering News Articles
In [198]:
def gatherKeywords(language, country):
    """
    This function takes a language and a country as arguments and retrieves a set of query-styled strings
    from the keywords and institutions data frames.
    """
    output = []

    # Collapsing keyword batches
    batches = sorted(keywords_long["Group"].unique())
    for batch in batches:
        subset = (
            keywords_long
            .copy()
            .loc[keywords_long["language"] == language]
            .loc[keywords_long["Group"] == batch]
        )
        query_style = " OR ".join(['"' + word + '"' for word in subset.keyword])
        query_style = query_style.replace("/", '" OR "')
        output.append(query_style)

    # Collapsing institutional names
    institutional_names = (
        institutions
        .copy()
        .loc[institutions["country"] == country]
    )
    query_style = " OR ".join(['"' + word + '"' for word in institutional_names.translation])
    output.append(query_style)

    return output
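# Illustrative example (hypothetical keyword terms) of the query strings assembled above;
# the "/" replacement splits slash-separated variants into separate OR terms.
example_terms = ["rule of law", "constitutional court/high court"]
example_query = " OR ".join(['"' + w + '"' for w in example_terms]).replace("/", '" OR "')
# example_query == '"rule of law" OR "constitutional court" OR "high court"'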
def newsFetcher(query, source, date_0 = "7 months ago", date_1 = "2 months ago", v2 = True):
    """
    This function takes a query and a news source as inputs and returns a data frame with all the results
    of that specific query through either V2 or V3 of the Newscatcher API.
    """
    # Defining initial counters
    page = 1
    npage = 100

    # Creating an empty list to store results
    outputs = []

    while page <= npage:

        # Defining URL, header, and parameters
        if v2 == True:
            url = "https://api.newscatcherapi.com/v2/search?"
            headers = {"x-api-key" : newscatcher_key_v2}
            params = {
                "q" : query,
                "sources" : source,
                "page" : page,
                "page_size" : 100,
                "from" : date_0,
                "to" : date_1,
                "sort_by" : "date"
            }
        else:
            url = "https://v3-api.newscatcherapi.com/api/search?"
            headers = {"x-api-token" : newscatcher_key_v3}
            params = {
                "q" : query,
                "sources" : source,
                "page" : page,
                "page_size" : 1000,
                "from_" : date_0,
                "to_" : date_1,
                "sort_by" : "date"
            }

        # Sending a GET call
        response = requests.get(url, params=params, headers=headers)
        time.sleep(1) # The API has a restriction of 1 call per second

        # Parsing the response in a JSON format
        json_data = json.loads(response.text)

        # Updating counters
        npage = json_data["total_pages"]
        total_hits = json_data["total_hits"]

        if total_hits > 0:
            # Converting results to a pandas data frame
            df = pd.DataFrame(json_data["articles"])
            min_date = df.iloc[-1]["published_date"]
            outputs.append(df)

        # Increasing/Resetting counters
        if total_hits < 10000 or page < npage:
            page = page + 1
        else:
            page = 1
            date_1 = min_date

    # Merging the entire list of data frames
    if outputs:
        results = pd.concat(outputs, ignore_index=True)
    else:
        results = None

    return results
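# Illustrative sketch (not used by the functions below): newsFetcher() assumes every call
# returns a valid JSON payload. A thin wrapper like this, retrying a few times on non-200
# responses before giving up, is one way to make long extraction runs more robust.
def safeGet(url, params, headers, retries = 3, wait = 5):
    for _ in range(retries):
        response = requests.get(url, params=params, headers=headers)
        if response.status_code == 200:
            return response
        time.sleep(wait)  # back off before retrying
    response.raise_for_status()  # surface the last HTTP error if all retries failed
    return response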
def extractNews(source, country, language, from_ = "7 months ago", to_ = "2 months ago", v2 = True):
    """
    This function takes a source's URL, the country it belongs to, and the language of the publication, and
    retrieves the news articles associated with the pre-defined queries within that news source.
    """
    print(f"Extracting news articles from: {source}")

    # Creating an empty dictionary to store results
    results_per_batch = {}

    # Iterating across batches of keywords/queries
    qbatches = gatherKeywords(language, country)
    for n, batch in enumerate(qbatches, start = 1):
        print(f"===== Extracting articles from Batch #{n}")
        batch_name = f"Batch {n}"
        fetched_news = newsFetcher(batch, source, from_, to_, v2)
        results_per_batch.update({batch_name : fetched_news})

    # Defining the outcome
    output = {
        source : results_per_batch
    }

    return output
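# Illustrative call (hypothetical arguments; the real runs iterate over the sources table below):
# extractNews("lemonde.fr", "France", "French", v2 = True)
# -> {"lemonde.fr": {"Batch 1": <DataFrame or None>, "Batch 2": ..., "Batch N": <institutional-names query results>}}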
def mergeData(dta, version):
    """
    This function takes a list containing all the returned data from the API and compiles it into a data set.
    """
    for element in dta:
        for source, batches in element.items():
            print(source)
            data_list = [data for batch, data in batches.items()]
            empty_source = all(data is None for data in data_list)
            if not empty_source:
                master_data = pd.concat(data_list)
                master_data.to_parquet(f"../data/data-extraction-1/{version}/{source}.parquet.gzip", compression = "gzip")
Using API V2 to gather news
In [7]:
results_list_v2 = (
    sources
    # .loc[64:64]
    .apply(lambda row: extractNews(row["shortURL"], row["Country"], row["Language"]), axis = 1)
    .tolist()
)
In [8]:
= "v2") mergeData(results_list_v2, version
Using API V3 to gather news
In [10]:
results_list_v3 = (
    sources
    # .loc[(sources["Priority"] == "Yes") & (sources["HP"] == "No")]
    .loc[sources["Priority"] == "No"]
    .apply(lambda row: extractNews(
        row["shortURL"], row["Country"], row["Language"],
        from_ = "2 months ago", to_ = "1 day ago", v2 = False
    ), axis = 1)
    .tolist()
)
In [11]:
= "v3") mergeData(results_list_v3, version
Compiling and saving data
In [112]:
= os.listdir("../data/data-extraction-1/v2")
files_v2 = [
data_sources_v2 f"../data/data-extraction-1/v2/{x}")
pd.read_parquet(for x in files_v2
]= pd.concat(data_sources_v2).drop_duplicates(subset = "_id")
master_v2 "../data/data-extraction-1/master_1v2.parquet.gzip", compression = "gzip") master_v2.to_parquet(
In [201]:
= os.listdir("../data/data-extraction-1/v3")
files_v3 = [
data_sources_v3 f"../data/data-extraction-1/v3/{x}")
pd.read_parquet(for x in files_v3
]= pd.concat(data_sources_v3).drop_duplicates(subset = "id")
master_v3 "../data/data-extraction-1/master_1v3.parquet.gzip", compression = "gzip") master_v3.to_parquet(
Harmonizing V2 and V3 data
In [6]:
= pd.read_parquet("../data/data-extraction-1/master_1v2.parquet.gzip")
master_v2 = pd.read_parquet("../data/data-extraction-1/master_1v3.parquet.gzip") master_v3
In [16]:
= ["id", "link", "domain_url", "published_date", "title", "description", "content", "language", "is_opinion"]
target_columns = (
master_v2_harm
master_v2
.rename(= {
columns "excerpt" : "description",
"summary" : "content",
"clean_url" : "domain_url",
"_id" : "id"
}
)
.loc[:, target_columns]
)= master_v3.loc[:, target_columns]
master_v3_harm = (
master_data
pd.concat([master_v2_harm, master_v3_harm])
.drop_duplicates()
)= master_data.loc[master_data[["title", "content"]].notna().all(axis=1)] master_data
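Because both harmonized frames are subset with .loc[:, target_columns], a quick sanity check (a sketch) confirms they ended up with exactly the same schema before they are concatenated:

# Sanity check: both harmonized frames expose exactly the target columns, in the same order
assert list(master_v2_harm.columns) == target_columns
assert list(master_v3_harm.columns) == target_columns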
Saving data
In [19]:
"../data/data-extraction-1/master_extraction_1.parquet.gzip", compression = "gzip") master_data.to_parquet(
Subsetting data for translation
In [5]:
= pd.read_parquet("../data/data-extraction-1/master_extraction_1.parquet.gzip") master_data
In [15]:
top_journals = {
"Luxembourg" : ['lequotidien.lu','tageblatt.lu','wort.lu'],
"Slovakia" : ['dennikn.sk','sme.sk', 'pravda.sk'],
"Hungary" : ['magyarhirlap.hu','magyarnemzet.hu','nepszava.hu'],
"Ireland" : ['irishexaminer.com','independent.ie','irishtimes.com'],
"Bulgaria" : ['24chasa.bg','capital.bg','dnevnik.bg'],
"Greece" : ['kathimerini.gr','protothema.gr','tanea.gr'],
"Portugal" : ['expresso.pt','publico.pt','jn.pt'],
"Cyprus" : ['philenews.com','politis.com.cy','sigmalive.com'],
"Czechia" : ['blesk.cz','denik.cz','lidovky.cz'],
"Estonia" : ['delfi.ee','postimees.ee'],
"Finland" : ['aamulehti.fi','hs.fi','ksml.fi'],
"France" : ['liberation.fr','lefigaro.fr','lemonde.fr'],
"Slovenia" : ['delo.si','dnevnik.si','vecer.com'],
"Spain" : ['abc.es','elmundo.es','elpais.com','lavanguardia.com'],
"Sweden" : ['dn.se','gp.se','svd.se'],
"Austria" : ['derstandard.at','diepresse.com','sn.at'],
"Belgium" : ['hln.be','lesoir.be','standaard.be'],
"Denmark" : ['berlingske.dk','jyllands-posten.dk','politiken.dk'],
"Germany" : ['faz.net','spiegel.de','sueddeutsche.de','zeit.de'],
"Italy" : ['corriere.it','lastampa.it','repubblica.it'],
"Netherlands" : ['ad.nl','nrc.nl','telegraaf.nl','volkskrant.nl'],
"Croatia" : ['jutarnji.hr','novilist.hr','vecernji.hr'],
"Malta" : ['independent.com.mt','timesofmalta.com','talk.mt'],
"Latvia" : ['db.lv','ir.lv','la.lv'],
"Lithuania" : ['baltic-review.com','lrytas.lt','ve.lt'],
"Poland" : ['fakt.pl','rp.pl','wyborcza.pl'],
"Romania" : ['adevarul.ro','evz.ro','libertatea.ro']
}
def assign_country(domain_url):
    for country, urls in top_journals.items():
        if domain_url in urls:
            return country
    return "Low priority journal"

master_data["country"] = master_data["domain_url"].apply(assign_country)
In [37]:
for country in sources["Country"].drop_duplicates().to_list():
    subset4translation = (
        master_data.copy()
        .loc[(master_data["country"] == country) & (master_data["is_opinion"] == False)]
    )
    print(f"{country}: {len(subset4translation)}")
    file_name = f"../data/data-extraction-1/data4translation/{country}_tp.parquet.gzip"
    subset4translation.to_parquet(file_name, compression = "gzip")
Austria: 46145
Belgium: 21287
Bulgaria: 38118
Croatia: 37068
Cyprus: 34955
Czechia: 41415
Denmark: 11312
Estonia: 12370
Finland: 6647
France: 64527
Germany: 45321
Greece: 49504
Hungary: 18111
Ireland: 48409
Italy: 93858
Latvia: 5487
Lithuania: 14396
Luxembourg: 7894
Malta: 10842
Netherlands: 23935
Poland: 21434
Portugal: 29624
Romania: 33264
Slovakia: 34874
Slovenia: 10211
Spain: 112820
Sweden: 6417