import pickle

# Import custom helper libraries
import os
import sys

# Make the project's ``src`` directory (a sibling of the working directory)
# importable, adding it to the module search path only once.
src_path = os.path.abspath(os.path.join(os.pardir, "src"))
already_registered = src_path in sys.path
if not already_registered:
    sys.path.append(src_path)

import features.helpers as feat_helpers
import data.helpers as data_helpers
import visualization.helpers as viz_helpers

# Maths modules
from scipy.stats import f_oneway
import pandas as pd

# Viz modules
import plotly.express as px

# Render for export
import plotly.io as pio

# Select "notebook" as Plotly's default renderer for the figures shown below
pio.renderers.default = "notebook"


# Download and unzip CSV files
# Runs the `dataset` target with `make` from the parent directory.
# NOTE: the leading `!` is an IPython shell escape, so this line only works
# inside a notebook / IPython session, not as plain Python.
!cd .. && make dataset && cd notebooks

>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.


# Read the raw CSV; column names are supplied explicitly, so the first row
# of the file is read as data
raw_csv_path = os.path.join(
    "..", "data", "raw", "training.1600000.processed.noemoticon.csv"
)
column_names = ["target", "id", "date", "flag", "user", "text"]
df = pd.read_csv(raw_csv_path, names=column_names)

# Reduce memory usage with the project helper
df = data_helpers.reduce_dataframe_memory_usage(df)


# Keep only the `target` and `text` columns
df = df.drop(columns=["id", "date", "flag", "user"])

# Swap the numeric target codes for readable labels
target_labels = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
df["target"] = df["target"].map(target_labels)


# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenizers, Stemmers and Lemmatizers
import nltk
from nltk.corpus import stopwords as nltk_stopwords
import spacy

# Download resources
nltk.download("stopwords")
# The corpus reader is imported under an alias so that `stopwords` only ever
# names the resulting set of English stop words (no shadowing of the import).
stopwords = set(nltk_stopwords.words("english"))

# Download SpaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Download it through spaCy's Python API (instead of an IPython `!` shell
    # escape) so this also works outside a notebook and targets the running
    # interpreter's environment.
    from spacy.cli import download as spacy_download

    spacy_download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


# Define tokenizer
def tokenizer(text):
    """Return the lower-cased SpaCy lemmas of *text*.

    Only tokens that are purely alphabetic and that SpaCy does not flag as
    stop words are kept.
    """
    return [
        token.lemma_.lower()
        for token in nlp(text)
        if token.is_alpha and not token.is_stop
    ]

2022-02-05 09:58:44.948434: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-05 09:58:44.948457: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/clement/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Processed data path
processed_data_path = os.path.join("..", "data", "processed")
vectorized_dataset_file_path = os.path.join(
    processed_data_path, "tfidf_spacy_dataset.pkl"
)
vocabulary_file_path = os.path.join(processed_data_path, "tfidf_spacy_vocabulary.pkl")

# Reuse the cached TF-IDF matrix and vocabulary only when BOTH files exist;
# otherwise recompute and rewrite both so they stay in sync.
if os.path.exists(vectorized_dataset_file_path) and os.path.exists(
    vocabulary_file_path
):
    # NOTE: unpickling executes arbitrary code. This is acceptable only because
    # these files are caches written by this notebook; never load pickles that
    # come from an untrusted source.
    # Load vectorized dataset
    with open(vectorized_dataset_file_path, "rb") as f:
        X = pickle.load(f)
    # Load vocabulary
    with open(vocabulary_file_path, "rb") as f:
        vocabulary = pickle.load(f)
else:
    # Define vectorizer
    vectorizer = TfidfVectorizer(
        strip_accents="unicode",
        lowercase=True,
        # scikit-learn documents `stop_words` as "english", a list or None;
        # recent releases validate the type and reject a set.
        stop_words=sorted(stopwords),
        tokenizer=tokenizer,
    )

    # Vectorize text
    X = vectorizer.fit_transform(df.text)

    # Get vocabulary
    vocabulary = vectorizer.get_feature_names_out()

    # Make sure the cache directory exists before writing into it
    os.makedirs(processed_data_path, exist_ok=True)

    # Save vectorized dataset as pickle
    with open(vectorized_dataset_file_path, "wb") as f:
        pickle.dump(X, f)

    # Save vocabulary as pickle
    with open(vocabulary_file_path, "wb") as f:
        pickle.dump(vocabulary, f)


from sklearn.decomposition import TruncatedSVD


# Fit the LSA model: a truncated SVD of the vectorized dataset X
n_components = 50
lsa = TruncatedSVD(n_components=n_components, random_state=42)
lsa.fit(X)


# Line chart of the explained variance ratio, one point per LSA component
# (components are numbered from 1 on the x axis)
axis_labels = {"x": "Component", "y": "Explained variance ratio"}
fig = px.line(
    x=range(1, n_components + 1),
    y=lsa.explained_variance_ratio_,
    markers=True,
    labels=axis_labels,
    title="Explained variance ratio of LSA",
)
fig.show()


# Reduce dimensionality
# Project the vectorized dataset onto the fitted LSA components
X_lsa = lsa.transform(X)

# Bare expression: its value is displayed as the cell output in a notebook
X_lsa.shape

(1600000, 50)


# Plot the top words of each topic
# `n_topics` follows `n_components` so the plot and its title stay consistent
# if the number of LSA components is changed.
viz_helpers.plot_top_words(
    model=lsa,
    feature_names=vocabulary,
    n_top_words=10,
    n_topics=n_components,
    title=f"LSA Topics / n_components={n_components}",
)


from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing; the split is stratified on the target
# labels and seeded for reproducibility
targets = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X_lsa,
    targets,
    stratify=targets,
    test_size=0.2,
    random_state=42,
)


from sklearn.linear_model import LogisticRegressionCV


# Define model
# Cross-validated logistic regression with library defaults and a fixed seed
model = LogisticRegressionCV(random_state=42)

# Train model
# Fitted on the training split only; as the last expression of the cell, the
# returned estimator is also displayed as the cell output.
model.fit(X_train, y_train)

LogisticRegressionCV(random_state=42)


# Compute the coefficients
# One label per LSA component, numbered from 1
topics = [f"Topic {i + 1}" for i in range(n_components)]
# `coef_[0]` is the first row of the coefficient matrix — presumably the only
# row, assuming the data holds just two classes (TODO confirm).
coefs = pd.Series(
    model.coef_[0],
    index=topics,
)

# Top 20 topics by importance (10 largest and 10 smallest coefficients),
# sorted in ascending order.
# `pd.concat` replaces `Series.append`, which is deprecated and removed in
# pandas 2.0.
top_20_coefs = pd.concat([coefs.nlargest(10), coefs.nsmallest(10)]).sort_values()

# Plot top 20 topics by importance (positive and negative)
fig = px.bar(
    top_20_coefs,
    x=top_20_coefs.index,
    y=top_20_coefs.values,
    labels={"x": "Topic", "y": "Importance", "color": "Importance"},
    title="Top 20 important topics",
    color=top_20_coefs.values,
)
fig.show()

/tmp/ipykernel_882346/3856082245.py:9: FutureWarning:

The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


# Show the classifier results on the training split (project plotting helper)
viz_helpers.plot_classifier_results(
    model,
    X_train,
    y_train,
    title="Train set results",
)


# Same helper on the held-out test split, for comparison with the train results
viz_helpers.plot_classifier_results(
    model,
    X_test,
    y_test,
    title="Test set results",
)


# Compute predictions
# Predict on the full reduced matrix (train and test rows alike) so that the
# predictions line up row-for-row with `df`.
y_pred = model.predict(X_lsa)
df["prediction"] = y_pred


import shap

# Set up SHAP's JavaScript so interactive plots can render in the notebook
shap.initjs()

# Build an explainer for the fitted model, passing the training split as its
# second argument and naming the features after the LSA topics; then explain
# every row of the full reduced dataset.
explainer = shap.Explainer(model, X_train, feature_names=topics)
shap_values = explainer(X_lsa)

Linear explainer: 1600001it [00:20, 36089.05it/s]


# False positive example
# First row labelled NEGATIVE but predicted POSITIVE.
# The row is located by POSITION rather than by index label, because both the
# text lookup and `shap_values[...]` below are positional; using a label would
# only be correct while `df` keeps a default 0..n-1 index.
# Raises IndexError if there is no such row.
fp_mask = (df.target == "NEGATIVE") & (df.prediction == "POSITIVE")
fp_index = int(fp_mask.to_numpy().nonzero()[0][0])
fp_text = df.text.iloc[fp_index]

print(fp_text)

# Force plot of the SHAP values for that single row
shap.plots.force(shap_values[fp_index])

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D


# False negative example
# First row labelled POSITIVE but predicted NEGATIVE.
# The row is located by POSITION rather than by index label, because both the
# text lookup and `shap_values[...]` below are positional; using a label would
# only be correct while `df` keeps a default 0..n-1 index.
# Raises IndexError if there is no such row.
fn_mask = (df.target == "POSITIVE") & (df.prediction == "NEGATIVE")
fn_index = int(fn_mask.to_numpy().nonzero()[0][0])
fn_text = df.text.iloc[fn_index]

print(fn_text)

# Force plot of the SHAP values for that single row
shap.plots.force(shap_values[fn_index])

Being sick can be really cheap when it hurts too much to eat real food  Plus, your friends make you soup

Baseline Model : Logistic Regression

Load project modules

Text pre-processing

Classification model

Dimension reduction & Topic modeling

Train and test the model