# Import custom helper libraries
import os
import sys

# Maths modules
import pandas as pd


# Make the project's ``src`` directory (a sibling of this notebook's folder)
# importable, adding it to the module search path only once.
src_path = os.path.abspath("../src")
if src_path not in sys.path:
    sys.path.append(src_path)

import data.helpers as data_helpers
import visualization.helpers as viz_helpers


# Load environment variables from .env file
from dotenv import load_dotenv

# Pull the variables declared in the .env file into the process environment,
# then read the Azure Text Analytics credentials (None when a variable is unset).
load_dotenv()
AZURE_TEXT_ANALYTICS_KEY = os.environ.get("AZURE_TEXT_ANALYTICS_KEY")
AZURE_TEXT_ANALYTICS_ENDPOINT = os.environ.get("AZURE_TEXT_ANALYTICS_ENDPOINT")

# Sample size used during development; any value <= 0 keeps the whole dataset.
TEXT_SAMPLE_SIZE = 2000


# Download and unzip CSV files by running the `make dataset` target from the
# parent directory of this notebook.
# NOTE(review): `!` commands run in their own subshell, so the trailing
# `cd notebooks` should not affect the kernel's working directory — confirm
# before relying on it.
!cd .. && make dataset && cd notebooks

>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.


# Read the raw tweets CSV, supplying the column names explicitly.
raw_csv_path = os.path.join(
    "..", "data", "raw", "training.1600000.processed.noemoticon.csv"
)
df = pd.read_csv(
    raw_csv_path,
    names=["target", "id", "date", "flag", "user", "text"],
)

# Shrink the in-memory footprint with the project helper.
df = data_helpers.reduce_dataframe_memory_usage(df)

# Keep only the label and the tweet text.
df = df.drop(columns=["id", "date", "flag", "user"])

# Turn the numeric target codes into readable labels.
target_labels = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
df["target"] = df["target"].map(target_labels)

# Optionally work on a sample balanced on the target column
# (skipped when TEXT_SAMPLE_SIZE <= 0).
if TEXT_SAMPLE_SIZE > 0:
    df = data_helpers.balance_sample(df, "target", TEXT_SAMPLE_SIZE)

# Last expression of the cell: display summary statistics.
df.describe()


from models.custom_azure_text_analysis_classifier import (
    CustomAzureTextAnalyticsClassifier,
)


# Build the classifier from the Azure Text Analytics credentials.
cls = CustomAzureTextAnalyticsClassifier(
    key=AZURE_TEXT_ANALYTICS_KEY,
    endpoint=AZURE_TEXT_ANALYTICS_ENDPOINT,
)

texts = df.text.values
labels = df.target.values

# Reuse the JSON cache when it exists; otherwise compute the sentiment scores
# for the current tweets and save them so later runs can skip this step.
cache_json_path = os.path.join("..", "results", "azure_cache.json")
if not os.path.exists(cache_json_path):
    cls.fit(X=texts, y=labels)
    cls.save_cache_json(filename=cache_json_path)
else:
    cls.load_cache_json(filename=cache_json_path)

# Plot classification performances of the classifier on the same tweets.
viz_helpers.plot_classifier_results(
    cls, texts, labels, title="Classification results"
)


# Copy the cached per-sentiment values of every tweet into their own columns
# (api_positive, api_neutral, api_negative — created in that order).
for sentiment in ("positive", "neutral", "negative"):
    df[f"api_{sentiment}"] = [
        cls.cache[text][sentiment] for text in df.text.values
    ]

# Plot the three new columns against the true target with the project helper.
viz_helpers.plot_boxes(
    dataframe=df,
    plot_columns=["api_positive", "api_neutral", "api_negative"],
    categorical_column="target",
)


from sklearn.model_selection import train_test_split


# Features are the three API columns; the label is the sentiment target.
X = df.loc[:, ["api_positive", "api_neutral", "api_negative"]]
y = df["target"]

# Hold out 20% of the rows for testing, stratified on the label and seeded
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)


from sklearn.linear_model import LogisticRegressionCV


# Cross-validated logistic regression, seeded for reproducibility.
model = LogisticRegressionCV(random_state=42)

# Fit on the training split only. Kept as the cell's last expression so the
# fitted estimator is still displayed.
model.fit(X=X_train, y=y_train)

LogisticRegressionCV(random_state=42)


# Plot the logistic regression model's results on the data it was trained on.
viz_helpers.plot_classifier_results(
    model,
    X_train,
    y_train,
    title="Train set results",
)


# Same plot on the held-out test split, for comparison with the train results.
viz_helpers.plot_classifier_results(
    model,
    X_test,
    y_test,
    title="Test set results",
)


# Predict a label for every row of X (train and test rows alike) and store it
# next to the true target for later comparison.
df["prediction"] = y_pred = model.predict(X)


import shap

# Load SHAP's JavaScript so interactive plots can render in the notebook.
shap.initjs()

# Build an explainer for the fitted model from the training features, labelling
# the outputs with the feature column names.
explainer = shap.Explainer(model, X_train, feature_names=X.columns)
# Explain every row of X (train and test rows), in the same row order as df.
shap_values = explainer(X)


# False positive example: a tweet labelled NEGATIVE that the model predicted
# as POSITIVE.
fp_mask = ((df.target == "NEGATIVE") & (df.prediction == "POSITIVE")).to_numpy()

# Use the row *position*, not the index label: both `df.text.values[...]` and
# `shap_values[...]` are positional, and position and label only coincide when
# df has a default 0..n-1 index (not guaranteed after sampling).
fp_positions = fp_mask.nonzero()[0]
if fp_positions.size == 0:
    raise IndexError("No false positive (NEGATIVE predicted as POSITIVE) found in df.")
fp_index = int(fp_positions[0])
fp_text = df.text.values[fp_index]

print(fp_text)

# Show how each feature contributed to this prediction.
shap.plots.force(shap_values[fp_index])

A great hard training weekend is over.  a couple days of rest and lets do it again!  Lots of computer time to put in now


# False negative example: a tweet labelled POSITIVE that the model predicted
# as NEGATIVE.
fn_mask = ((df.target == "POSITIVE") & (df.prediction == "NEGATIVE")).to_numpy()

# Use the row *position*, not the index label: both `df.text.values[...]` and
# `shap_values[...]` are positional, and position and label only coincide when
# df has a default 0..n-1 index (not guaranteed after sampling).
fn_positions = fn_mask.nonzero()[0]
if fn_positions.size == 0:
    raise IndexError("No false negative (POSITIVE predicted as NEGATIVE) found in df.")
fn_index = int(fn_positions[0])
fn_text = df.text.values[fn_index]

print(fn_text)

# Show how each feature contributed to this prediction.
shap.plots.force(shap_values[fn_index])

Is lookin 4ward to a long weekend  really dont want to go to work 2day tho =[ x

Azure Cognitive Services: Text Analytics API

Load project modules and data

Classification Models

Basic model from Azure Cognitive Service for Language

Logistic Regression model based on Azure Cognitive Service for Language

Models comparison

	target	text
count	2000	2000
unique	2	2000
top	NEGATIVE	@xnausikaax oh no! where did u order from? tha...
freq	1000	1