2. Translation code

In [1]:

import os
import time
import pandas as pd
import math
import nltk
from nltk.tokenize import sent_tokenize
from deep_translator import GoogleTranslator
# Fetch the "punkt" resource into the local NLTK data folder; the call reports the package
# as up-to-date when it is already installed (see the output below).
# NOTE(review): presumably this is the sentence-tokenizer model used by sent_tokenize - confirm
# against the installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ctoruno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!

True

In [3]:

# Country being processed; it is interpolated into the input file name and into the
# output folder and file names of the translated batches.
country = "Romania"
# Base directory under which the input parquet file is read and the translated batches are written.
path2SP = ".../Data/data-extraction-1"

In [4]:

# Read the country-specific parquet file from the data4translation folder into a data frame.
input_file  = f"{path2SP}/data4translation/{country}_tp.parquet.gzip"
master_data = pd.read_parquet(input_file)

In [5]:

def trans2english(text, sourcelang):
    """
    This function takes a text in a specific language and returns its equivalent
    in English using the Google translation engine.

    The text is split into sentences, the sentences are translated as one batch and the
    translated sentences are joined back together with single spaces.

    Parameters:
        text:       String. Text to translate. Missing values (None, NaN or any other
                    non-string value) and empty or whitespace-only strings are not translated.
        sourcelang: String. Code of the source language you want to translate the text from.

    Returns:
        String. The English translation. When there is nothing to translate, or when the
        translation raises an error, a descriptive message is returned instead.
    """
    # NaN is a truthy float, so a plain truthiness check would pass it on to the tokenizer
    # and the missing value would be reported as an API failure. Anything that is not a
    # non-blank string is treated as "no information".
    if not isinstance(text, str) or not text.strip():
        return "No information available. No translation performed"

    try:
        sentences = sent_tokenize(text)
        batch     = GoogleTranslator(source = sourcelang, target = "en").translate_batch(sentences)
        return " ".join(batch)
    except Exception as e:
        # Deliberate best-effort: a failure on one text must not stop the whole run,
        # so the reason is stored in place of the translation.
        return f"Translation through API failed. Reason: {e}"

In [2]:

# Run configuration. These values do not change between batches, so they are set once.
mode        = "overnight" # One of two values: "9to5" OR "overnight"
batch_sizes = {"9to5": 500, "overnight": 2000} # Rows translated per batch in each mode
if mode not in batch_sizes:
    raise ValueError(f"Unknown mode '{mode}'. Expected one of: {list(batch_sizes)}")
batch_size = batch_sizes[mode]

# Create the output folder before translating, so that a missing directory cannot make
# the save step fail after a whole batch has already been translated.
output_dir = f"{path2SP}/translation-batches/{country}"
os.makedirs(output_dir, exist_ok = True)

# The number of batches follows the batch size of the active mode.
total_batches = math.ceil(len(master_data)/batch_size)
for batch_number in range(0, total_batches):
    # Only the counter of the active mode advances; the other one stays at zero.
    counter_day   = batch_number if mode == "9to5" else 0
    counter_night = batch_number if mode == "overnight" else 0
    mode_counter  = counter_day if mode == "9to5" else counter_night

    # 1-based batch id, used in the progress message and in the output file name.
    batch = counter_day + counter_night + 1

    print("======================================================================")
    print(f"Currently running batch number: {batch} out of {total_batches}")

    starting_row = (counter_day*batch_sizes["9to5"])+(counter_night*batch_sizes["overnight"])
    final_row    = starting_row+batch_size
    # Slice first and copy afterwards: only the rows of this batch are duplicated
    # instead of the whole data frame on every iteration.
    batch_subset = master_data.iloc[starting_row:final_row].copy()

    print(f"Starting row: {starting_row}")

    # Translate the three text columns row by row, using the language code stored in each row.
    batch_subset[["title_trans", "description_trans", "content_trans"]] = batch_subset.apply(
        lambda row: row[["title", "description", "content"]].apply(
            lambda x: trans2english(text = x, sourcelang = row["language"])
        ),
        axis = 1
    )

    print("Translation completed")
    batch_subset.to_parquet(f"{output_dir}/{country}_batch_{batch}_{mode}_{mode_counter}.parquet.gzip", compression = "gzip")

    # Pause between batches (presumably to avoid being rate-limited by the translation
    # service - confirm). There is nothing to wait for after the last batch.
    if batch < total_batches:
        time.sleep(25)