Air Paradis : Detect bad buzz with deep learning

Context

"Air Paradis" is an airline company who's marketing department wants to be able to detect quickly "bad buzz" on social networks, to be able to anticipate and address issues as fast as possible. They need an AI API that can detect "bad buzz" and predict the reason for it.

The goal here is to evaluate different approaches to detect "bad buzz" :

  1. Baseline Model : Logistic Regression
  2. Word embedding : Gensim Doc2Vec
  3. Azure Cognitive Services : Text Analytics API
  4. HuggingFace Transformer Pipeline : Sentiment Analysis
  5. HuggingFace : BERT Fine-tuning
  6. AzureML Studio : Automated ML
  7. AzureML Studio : Designer
  8. Custom Models : Neural Networks with Keras
  9. AzureML Studio : Notebooks

After exploring our dataset, we will compare the different approaches.

Project modules

The helper functions and project-specific code will be placed in ../src/.

We will use the Python programming language, and present the code and results here in this JupyterLab Notebook file.

We will use the usual libraries for data exploration, modeling and visualisation :

We will also use libraries specific to the goals of this project :

Exploratory data analysis (EDA)

We are going to load the data and analyse the distribution of each variable.

Load data

Let's download the data from the Kaggle Sentiment140 dataset with 1.6 million tweets.
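A minimal download sketch, assuming the kaggle package is installed and an API token is configured ; the kazanova/sentiment140 slug and the ../data/ destination are assumptions :

    from kaggle.api.kaggle_api_extended import KaggleApi

    # Authenticate with the Kaggle API (requires ~/.kaggle/kaggle.json)
    api = KaggleApi()
    api.authenticate()

    # Download and unzip the Sentiment140 dataset (assumed slug) into ../data/
    api.dataset_download_files("kazanova/sentiment140", path="../data", unzip=True)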

Now we can load the data.
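A loading sketch, assuming the unzipped CSV sits in ../data/ ; the file name, the latin-1 encoding and the absence of a header row are those of the standard Sentiment140 distribution :

    import pandas as pd

    # The CSV has no header row; the six columns are documented by Sentiment140
    columns = ["target", "ids", "date", "flag", "user", "text"]
    df = pd.read_csv(
        "../data/training.1600000.processed.noemoticon.csv",
        encoding="latin-1",
        names=columns,
    )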

Explore data

Let's display a few examples, find out how many data points are available, what the variables are and what their distribution is.
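For example, along these lines :

    # Shape, variables and a few random examples
    print(df.shape)
    print(df.dtypes)
    df.sample(5, random_state=42)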

There are 1600000 rows, each composed of 6 columns :

We are only interested in the target and text variables. The other columns are not useful for our analysis.
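One way to keep just those two columns ; in Sentiment140 the target is encoded as 0 (negative) or 4 (positive), which we map to readable labels :

    # Keep only the target and text variables
    df = df[["target", "text"]]

    # Map the numeric encoding (0 = negative, 4 = positive) to readable labels
    df["target"] = df["target"].map({0: "NEGATIVE", 4: "POSITIVE"})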

There are exactly as many (800000) POSITIVE tweets as NEGATIVE tweets. There are no NEUTRAL tweets. The problem is well balanced and there will be no bias towards one class during the training of our models.

There is no big difference in character count between POSITIVE and NEGATIVE tweets, but NEGATIVE tweets are slightly longer than POSITIVE tweets. In both classes, there are two modes : ~45 characters and 138 characters (the maximum allowed at the time).

The word-count distributions are also similar, although NEGATIVE tweets are significantly longer than POSITIVE tweets. In both classes, there are two modes : ~7 words and ~20 words.
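A sketch of how these length distributions can be computed and plotted (a simple whitespace split is used here as a rough word count) :

    import matplotlib.pyplot as plt

    # Character and word counts per tweet
    df["n_chars"] = df["text"].str.len()
    df["n_words"] = df["text"].str.split().str.len()

    # One histogram per class for each measure
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    for label, group in df.groupby("target"):
        axes[0].hist(group["n_chars"], bins=50, alpha=0.5, label=label)
        axes[1].hist(group["n_words"], bins=40, alpha=0.5, label=label)
    axes[0].set_xlabel("characters per tweet")
    axes[1].set_xlabel("words per tweet")
    axes[0].legend()
    plt.show()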

Text analysis

We will look in more detail at what the text variable contains.

First, we will transform the dataset into a Bag of Words representation with TF-IDF (Term Frequency - Inverse Document Frequency) weights. To achieve this, we are going to use the SpaCy tokenizer.
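A minimal sketch of this vectorization, assuming a blank spaCy English pipeline is enough for tokenization (the actual notebook may filter tokens differently) :

    import spacy
    from sklearn.feature_extraction.text import TfidfVectorizer

    # A blank pipeline provides only the spaCy tokenizer, which is all we need here
    nlp = spacy.blank("en")

    def spacy_tokenize(text):
        # Lowercase tokens, ignoring punctuation and whitespace
        return [t.text.lower() for t in nlp(text) if not t.is_punct and not t.is_space]

    # Bag of Words with TF-IDF weights, using the custom tokenizer
    vectorizer = TfidfVectorizer(tokenizer=spacy_tokenize, min_df=5)
    X = vectorizer.fit_transform(df["text"])

The min_df threshold is an assumption ; it simply keeps the vocabulary to terms that appear in at least a few tweets.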

Our corpus is now transformed into a BoW representation. We can analyse the word frequencies.
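For instance, by summing the TF-IDF weights of each term over the whole corpus :

    import numpy as np

    # Total TF-IDF weight of each term over the corpus
    weights = np.asarray(X.sum(axis=0)).ravel()
    terms = np.array(vectorizer.get_feature_names_out())

    # Top 20 terms by total weight
    print(terms[np.argsort(weights)[::-1][:20]])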

We can see that the most important words are actually meaningful and relevant with respect to the sentiment associated with each message.

Models comparison

In this section, we are going to compare the metrics of the models we have tested in the other Notebooks.

Raw metrics

Each model is built and tested in the corresponding Notebooks (cf. list above). We are only looking at the classification metrics here.
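A purely illustrative way of gathering them, assuming each notebook exported its metrics as a JSON file under ../metrics/ (the actual storage layout may differ) :

    from pathlib import Path
    import pandas as pd

    # Hypothetical layout: one JSON file of classification metrics per model notebook
    records = []
    for path in Path("../metrics").glob("*.json"):
        record = pd.read_json(path, typ="series")
        record["model"] = path.stem
        records.append(record)

    metrics = pd.DataFrame(records).set_index("model")
    metrics.sort_values("accuracy", ascending=False)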

Observations

Best model

The best model is (by far) the one obtained in 6.2 - AzureML Automated ML : 10h on GPU :

Pros
Cons

Off-the-shelf models (cloud or pre-trained)

These models (3.1 - Azure Cognitive Service API and 4 - HuggingFace Sentiment Analysis) have produced average results.

Adding a classification model on top of the Azure Cognitive Service predictions did not improve the results much (3.2 - Logistic Regression on Azure Cognitive Service).

Pros
Cons

Fine-tuned BERT model

We've seen that a fine-tuned BERT model can be very efficient as a pre-processing layer (cf. 6.2 - AzureML Automated ML : 10h on GPU).

But directly fine-tuning BERT for sentiment analysis (5.1 - HuggingFace : BERT Fine-tuning) proved to be a real challenge : we were only able to obtain average results after ~6.5h of training on a GPU.

Using a model better suited to tweets (5.2 - HuggingFace : BERTweet Fine-tuning) greatly improved the results, at the cost of a longer training time (11h).

Given that the BERT model has more than 109M parameters (134M for BERTweet), our dataset is probably too small to fine-tune the model efficiently.
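For reference, a minimal fine-tuning sketch with the HuggingFace Trainer ; the checkpoint name, the hyper-parameters and the 90/10 split are illustrative, not the exact settings used in notebook 5 :

    from datasets import Dataset
    from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                              Trainer, TrainingArguments)

    # bert-base-uncased for 5.1; vinai/bertweet-base for the BERTweet variant (5.2)
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Build train / eval splits from the tweets (labels must be integers)
    data = df.assign(label=(df["target"] == "POSITIVE").astype(int))[["text", "label"]]
    splits = Dataset.from_pandas(data).train_test_split(test_size=0.1, seed=42)

    def tokenize(batch):
        return tokenizer(batch["text"], truncation=True, max_length=128)

    train_ds = splits["train"].map(tokenize, batched=True)
    eval_ds = splits["test"].map(tokenize, batched=True)

    args = TrainingArguments(output_dir="bert-finetuned",
                             num_train_epochs=2,
                             per_device_train_batch_size=32)
    trainer = Trainer(model=model, args=args, tokenizer=tokenizer,
                      train_dataset=train_ds, eval_dataset=eval_ds)
    trainer.train()

Passing the tokenizer to the Trainer lets it pad each batch dynamically instead of padding every tweet to the maximum length.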

Pros
Cons

Custom Neural Networks

Starting from a basic model (8.1 - FFNN on word counts), then adding an Embedding layer (8.4 - FFNN with custom Embedding), recurrent layers (8.6 - RNN, 8.7 - LSTM, 8.8 - Bidirectional-LSTM) and finally a second stacked Bidirectional-LSTM layer (8.9 - Stacked Bidirectional-LSTM) showed that more complexity does not mean better results.

Adding the Embedding layer actually reduced the classification performance. Adding the recurrent layer improved the model with the Embedding layer, and adding the LSTM layer improved the results further. The Bidirectional-LSTM layer only slightly improved the results, and stacking a second Bidirectional-LSTM did not significantly change them.
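A sketch of the Bidirectional-LSTM variant in Keras (vocabulary size, sequence length and layer sizes are assumptions, not the exact values used in notebook 8) :

    import tensorflow as tf
    from tensorflow.keras import layers

    vocab_size, seq_len, embed_dim = 20000, 50, 128

    model = tf.keras.Sequential([
        layers.Input(shape=(seq_len,)),
        layers.Embedding(vocab_size, embed_dim),
        # For the stacked variant, insert a first Bidirectional(LSTM(..., return_sequences=True)) here
        layers.Bidirectional(layers.LSTM(64)),
        layers.Dense(64, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    model.summary()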

Pros
Cons