# Import custom helper libraries
import os
import sys

# Maths modules
import pandas as pd


# Make the project's ``src`` directory (one level up) importable, adding it
# to the module search path only once.
src_path = os.path.abspath(os.path.join(os.pardir, "src"))
already_on_path = src_path in sys.path
if not already_on_path:
    sys.path.append(src_path)

import data.helpers as data_helpers
import visualization.helpers as viz_helpers


# Size of the balanced sample used during development; any value <= 0
# skips sampling so the full dataset is used.
TEXT_SAMPLE_SIZE = 2_000


# Download and unzip CSV files by running the project's `make dataset` target
# from the parent directory.
# NOTE(review): IPython runs a `!` line in a subshell, so the trailing
# `cd notebooks` presumably has no effect on the kernel's working directory.
!cd .. && make dataset && cd notebooks

>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.


# Read the raw tweets CSV, supplying the column names explicitly
raw_csv_path = os.path.join(
    "..", "data", "raw", "training.1600000.processed.noemoticon.csv"
)
raw_columns = ["target", "id", "date", "flag", "user", "text"]
df = pd.read_csv(raw_csv_path, names=raw_columns)

# Shrink the in-memory footprint of the frame
df = data_helpers.reduce_dataframe_memory_usage(df)

# Keep only the label and the tweet text
df = df.drop(columns=["id", "date", "flag", "user"])

# Turn the numeric target codes into readable labels
target_labels = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
df["target"] = df["target"].map(target_labels)

# Work on a class-balanced sample unless sampling is disabled
if TEXT_SAMPLE_SIZE > 0:
    df = data_helpers.balance_sample(df, "target", TEXT_SAMPLE_SIZE)

df.describe()


from models.custom_huggingface_sentiment_analysis_classifier import (
    CustomHuggingfaceSentimentAnalysisClassifier,
)


# Initialize the HuggingFace sentiment analysis classifier
cls = CustomHuggingfaceSentimentAnalysisClassifier()

# Reuse cached results when the cache file exists; otherwise compute the
# sentiment scores and write them to the cache for the next run.
cache_json_path = os.path.join("..", "results", "huggingface_cache.json")
if not os.path.exists(cache_json_path):
    cls.fit(X=df["text"].values, y=df["target"].values)
    cls.save_cache_json(filename=cache_json_path)
else:
    cls.load_cache_json(filename=cache_json_path)

# Plot classification performances on the loaded tweets
tweet_texts = df["text"].values
true_labels = df["target"].values
viz_helpers.plot_classifier_results(
    cls, tweet_texts, true_labels, title="Classification results"
)

2022-01-31 12:33:05.546556: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-31 12:33:05.546594: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
2022-01-31 12:33:12.415728: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-01-31 12:33:12.415764: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-01-31 12:33:12.415791: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (gros-bernard): /proc/driver/nvidia/version does not exist
2022-01-31 12:33:12.416294: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-31 12:33:12.438451: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


# Predict a label for every tweet and store it next to the true target
tweet_texts = df["text"].values
y_pred = cls.predict(tweet_texts)
df["prediction"] = y_pred


import shap

# Load SHAP's JavaScript visualisation code into the notebook so the
# interactive plots below can render.
shap.initjs()

# Build the SHAP explainer from the classifier's `classifier` attribute.
# NOTE(review): `cls.classifier` is presumably the underlying HuggingFace
# pipeline wrapped by the custom class — confirm in its definition.
explainer = shap.Explainer(cls.classifier)


# False positive example: a tweet labelled NEGATIVE but predicted POSITIVE
fp_mask = (df.target == "NEGATIVE") & (df.prediction == "POSITIVE")
if not fp_mask.any():
    raise IndexError("No false positive example found in the data")

# Index *label* of the first false positive (kept for later reference)
fp_index = df.index[fp_mask][0]
# Pick the text by position within the filtered rows. Using the label as a
# position in ``df.text.values`` is only correct when the frame has a fresh
# 0..n-1 index, which is not guaranteed after sampling.
fp_text = df.loc[fp_mask, "text"].iloc[0]

# Explain the prediction for this single tweet
shap_values = explainer([fp_text])

# Show each token's contribution to the "POSITIVE" output
shap.plots.text(shap_values[0, :, "POSITIVE"])

Partition explainer: 2it [00:34, 34.54s/it]


# False negative example: a tweet labelled POSITIVE but predicted NEGATIVE
fn_mask = (df.target == "POSITIVE") & (df.prediction == "NEGATIVE")
if not fn_mask.any():
    raise IndexError("No false negative example found in the data")

# Index *label* of the first false negative (kept for later reference)
fn_index = df.index[fn_mask][0]
# Pick the text by position within the filtered rows. Using the label as a
# position in ``df.text.values`` is only correct when the frame has a fresh
# 0..n-1 index, which is not guaranteed after sampling.
fn_text = df.loc[fn_mask, "text"].iloc[0]

# Explain the prediction for this single tweet
shap_values = explainer([fn_text])

# Show each token's contribution to the "POSITIVE" output
shap.plots.text(shap_values[0, :, "POSITIVE"])

Partition explainer: 2it [00:31, 31.59s/it]

HuggingFace Transformer Pipeline: Sentiment Analysis

Load project modules and data

Classification Model

HuggingFace's Transformer Pipeline: Sentiment Analysis model

	target	text
count	2000	2000
unique	2	2000
top	NEGATIVE	@xnausikaax oh no! where did u order from? tha...
freq	1000	1