HuggingFace: BERT Fine-tuning

In this notebook, we will fine-tune a pre-trained model to predict the sentiment of tweets, following the HuggingFace Fine-tune with TensorFlow workflow.

We will compare this locally fine-tuned model to the baseline model from 1_baseline.ipynb.

In order to use a GPU for training, we used the Kaggle environment.

Load project modules and data

We will use basic Python packages, plus the HuggingFace transformers package, to predict text sentiment.
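As a minimal sketch of this step (the file path and column names are hypothetical placeholders, not the project's actual ones):

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# "data/tweets.csv" and the "text"/"sentiment" columns are assumed
# placeholders for the project's actual labelled tweet dataset.
df = pd.read_csv("data/tweets.csv")

# Hold out a test set for the evaluation reported below.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
y_train = df_train["sentiment"].to_numpy()  # 0 = NEGATIVE, 1 = POSITIVE
```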

Text preprocessing

The text is transformed into tensors with AutoTokenizer.
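A minimal sketch of this step, assuming a df_train["text"] column that is not taken from the project code:

```python
from transformers import AutoTokenizer

# Load the tokenizer matching the checkpoint we will fine-tune
# ("bert-base-uncased" here; "vinai/bertweet-base" for the BERTweet run).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the raw tweets into padded/truncated TensorFlow tensors;
# max_length=128 is an assumption, not the project's setting.
encodings = tokenizer(
    list(df_train["text"]),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="tf",
)
```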

Model fine-tuning

We are going to fit a TFAutoModelForSequenceClassification model in order to adapt it to our dataset.
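Below is a minimal fine-tuning sketch following the HuggingFace TensorFlow workflow; the hyper-parameters (learning rate, batch size, epochs) and the y_train labels are assumptions, not the values used for the runs reported below:

```python
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# Load the pre-trained checkpoint with a fresh binary classification head.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)

# A small learning rate is typical when fine-tuning a transformer;
# the logits are raw scores, hence from_logits=True.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

model.fit(
    dict(encodings),   # tokenized tweets from the previous step
    y_train,           # hypothetical array of 0/1 sentiment labels
    validation_split=0.1,
    batch_size=32,
    epochs=2,
)
```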

Results and evaluation

We fine-tuned two different models and will compare the results: the standard BERT model, and the BERTweet model, which is better adapted to English tweets.

Vanilla BERT model: bert-base-uncased

The model has been trained for ~6.5h on 1M tweets on Kaggle with the GPU accelerator: oc-p7_bert_fine-tuning - Version 8 - 8-BERT-1M.

The model has more than 109M parameters, so the 1M tweets are probably not enough to train the model correctly.

The performance on the test dataset is slightly better than the baseline model's, but not as good as that of the other models:

But this model is heavily biased towards the POSITIVE class: it predicted 9.1 times (baseline = 35%, -89%) more POSITIVE (181954) messages than NEGATIVE (18046).
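For clarity, here is a minimal sketch of how this bias check can be computed; y_pred_proba is an assumed name for the model's predicted POSITIVE probabilities on the test set:

```python
import numpy as np

# y_pred_proba: hypothetical array of predicted POSITIVE probabilities
# for the 200,000 test tweets.
y_pred = (y_pred_proba >= 0.5).astype(int)

n_pos = int(np.sum(y_pred == 1))
n_neg = int(np.sum(y_pred == 0))

# "X times more" above is the relative excess (n_pos - n_neg) / n_neg:
# (181954 - 18046) / 18046 ~= 9.1 for this run, ~0.35 for the baseline.
print(f"POSITIVE: {n_pos}, NEGATIVE: {n_neg}, "
      f"excess: {(n_pos - n_neg) / n_neg:.2f}")
```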

Model adapted to English tweets: vinai/bertweet-base

The model has been trained for ~11h on 1M tweets on Kaggle with the GPU accelerator: oc-p7_bert_fine-tuning - Version 10 - 10-BERTweet-1M.

The model has more than 134M parameters, so the 1M tweets are probably not enough to train the model correctly.

This run eventually failed because it reached Kaggle's maximum execution time, but the results are still available in the logs:

```log
Confusion Matrix :
[[79351 20649]
 [10016 89984]]

ROC AUC score : 0.915
Average Precision score : 0.901

Text : "@Retrievergirl Clapton's is certainly one of the worlds greatest guitarists , and for me closely followed by Brian May"
True sentiment : 1
Predicted sentiment : 0.542
```
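A minimal sketch of how these metrics can be reproduced with scikit-learn (y_test and y_pred_proba are assumed names, not the project's actual variables):

```python
from sklearn.metrics import (
    average_precision_score,
    confusion_matrix,
    roc_auc_score,
)

# y_test: hypothetical array of true 0/1 labels on the test set;
# y_pred_proba: predicted POSITIVE probabilities from the fine-tuned model.
y_pred = (y_pred_proba >= 0.5).astype(int)

print("Confusion Matrix :")
print(confusion_matrix(y_test, y_pred))
print(f"ROC AUC score : {roc_auc_score(y_test, y_pred_proba):.3f}")
print(f"Average Precision score : "
      f"{average_precision_score(y_test, y_pred_proba):.3f}")
```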

The performance on the test dataset is much better than the baseline model's:

But this model is still noticeably biased towards the POSITIVE class: it predicted 23% (baseline = 35%, -35%) more POSITIVE (110633) messages than NEGATIVE (89367).