In this notebook, we will use the Keras library to implement different Artificial Neural Network (ANN) models.
We will compare these models to the baseline model from 1_baseline.ipynb.
We will use basic Python packages, as well as TensorFlow and Keras, to build our neural network models.
import pickle
from tqdm import tqdm
# Import custom helper libraries
import os
import sys
src_path = os.path.abspath(os.path.join("../src"))
if src_path not in sys.path:
sys.path.append(src_path)
import data.helpers as data_helpers
import visualization.helpers as viz_helpers
# Maths modules
import numpy as np
import pandas as pd
import tensorflow as tf
# Render for export
import plotly.io as pio
pio.renderers.default = "notebook"
2022-02-02 12:21:49.261031: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory 2022-02-02 12:21:49.261062: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
# Download and unzip CSV files
!cd .. && make dataset && cd notebooks
>>> Downloading and extracting data files... Data files already downloaded. >>> OK.
# Load data from CSV
df = pd.read_csv(
os.path.join("..", "data", "raw", "training.1600000.processed.noemoticon.csv"),
names=["target", "id", "date", "flag", "user", "text"],
)
# Reduce memory usage
df = data_helpers.reduce_dataframe_memory_usage(df)
# Drop useless columns
df.drop(columns=["id", "date", "flag", "user"], inplace=True)
# Replace target values with labels
df.target = df.target.map(
{
0: "NEGATIVE",
2: "NEUTRAL",
4: "POSITIVE",
}
)
# Binarize target
df.target = df.target.map(
{
"NEGATIVE": 0,
"POSITIVE": 1,
}
)
df.describe()
|  | target |
| --- | --- |
| count | 1600000.0 |
| mean | 0.5 |
| std | 0.5 |
| min | 0.0 |
| 25% | 0.0 |
| 50% | 0.5 |
| 75% | 1.0 |
| max | 1.0 |
In this model, we will use a simple word-count vector as our input, with no text preprocessing. The neural network architecture will be a simple feed-forward network with two hidden layers.
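To make this input representation concrete, here is a minimal, illustrative sketch (toy sentences rather than the project data, and a hypothetical `toy_vectorizer` name) of what Keras' TextVectorization layer produces in "count" mode: each document becomes a bag-of-words vector of token counts over the learned vocabulary.
import tensorflow as tf
from keras.layers import TextVectorization
# Toy example only: learn a tiny vocabulary, then count tokens per document.
toy_vectorizer = TextVectorization(output_mode="count", max_tokens=10)
toy_vectorizer.adapt(tf.constant(["good movie good plot", "bad movie"]))
print(toy_vectorizer.get_vocabulary())
# One row per document, one column per vocabulary entry; "good" is counted twice here.
print(toy_vectorizer(tf.constant(["good movie good"])).numpy())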
from sklearn.model_selection import train_test_split
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
df.text,
df.target,
test_size=0.2,
stratify=df.target,
random_state=42,
)
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Input, TextVectorization, Dense, Dropout, Activation
from keras.callbacks import TensorBoard, EarlyStopping
from keras.metrics import AUC
model_name = "ffnn_on_count"
vocabulary_size = 10000
results_data_path = os.path.join("..", "results")
model_file_path = os.path.join(results_data_path, model_name)
if os.path.exists(model_file_path):
# Load model
model = load_model(model_file_path)
else:
# Define vectorizer
vectorize_layer = TextVectorization(
output_mode="count",
max_tokens=vocabulary_size,
pad_to_max_tokens=True,
)
vectorize_layer.adapt(
df.text,
batch_size=128,
)
# define NN model
model = Sequential(name=model_name)
model.add(Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
model.add(Dense(100, input_shape=(vocabulary_size,), activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(10, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
# compile NN network
model.compile(
loss="binary_crossentropy",
optimizer="adam",
metrics=[
"accuracy",
AUC(curve="ROC", name="ROC_AUC"),
AUC(curve="PR", name="AP"),
],
)
# fit NN model
model.fit(
X_train,
y_train,
epochs=10,
batch_size=128,
validation_split=0.2,
callbacks=[
TensorBoard(log_dir=f"logs/{model.name}"),
EarlyStopping(monitor="val_loss", patience=2),
],
workers=4,
use_multiprocessing=True,
)
model.save(model_file_path)
print(model.summary())
Model: "ffnn" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= text_vectorization_3 (TextV (None, 10000) 0 ectorization) dense (Dense) (None, 100) 1000100 dropout (Dropout) (None, 100) 0 dense_1 (Dense) (None, 10) 1010 dense_2 (Dense) (None, 1) 11 ================================================================= Total params: 1,001,121 Trainable params: 1,001,121 Non-trainable params: 0 _________________________________________________________________ None
y_train_pred_proba = model.predict(
X_train,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_train_pred = [round(pred_proba[0]) for pred_proba in y_train_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_train,
y_train,
y_train_pred,
y_train_pred_proba,
title="Train set results",
)
y_test_pred_proba = model.predict(
X_test,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_test_pred = [round(pred_proba[0]) for pred_proba in y_test_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_test,
y_test,
y_test_pred,
y_test_pred_proba,
title="Test set results",
)
The performance on the train dataset is better than on the test dataset, which indicates that our model has slightly over-fitted.
The performances on the test dataset are much better than our baseline model's:
Our model is very well balanced: it predicted only 0.6% (baseline = 35%, -98%) more NEGATIVE (160499) messages than POSITIVE (159501).
# Tokenizers, Stemmers and Lemmatizers
import spacy
# Processed data path
processed_data_path = os.path.join("..", "data", "processed")
vectorized_dataset_file_path = os.path.join(
processed_data_path, "spacy_vectorized_dataset.pkl"
)
if os.path.exists(vectorized_dataset_file_path):
# Load vectorized dataset
with open(vectorized_dataset_file_path, "rb") as f:
X = pickle.load(f)
else:
# Download SpaCy model
try:
nlp = spacy.load("en_core_web_lg")
except OSError:
!python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")
# Encode text
X = [nlp(doc).vector for doc in tqdm(df.text)]
# Save vectorized dataset as pickle
with open(vectorized_dataset_file_path, "wb") as f:
pickle.dump(X, f)
from sklearn.model_selection import train_test_split
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
X,
df.target,
test_size=0.2,
stratify=df.target,
random_state=42,
)
import numpy as np
import tensorflow as tf
from keras.models import load_model
from keras.models import Sequential
from keras.layers import (
Input,
TextVectorization,
Dense,
Dropout,
Activation,
)
from keras.callbacks import TensorBoard, EarlyStopping
from keras.metrics import AUC
model_name = "ffnn_on_spacy"
vector_size = len(X[0])
results_data_path = os.path.join("..", "results")
model_file_path = os.path.join(results_data_path, model_name)
if os.path.exists(model_file_path):
# Load model
model = load_model(model_file_path)
else:
# define NN model
model = Sequential(name=model_name)
model.add(Dense(100, input_shape=(vector_size,), activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(10, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
# compile NN network
model.compile(
loss="binary_crossentropy",
optimizer="adam",
metrics=[
"accuracy",
AUC(curve="ROC", name="ROC_AUC"),
AUC(curve="PR", name="AP"),
],
)
# fit NN model
model.fit(
np.stack(X_train, axis=0),
y_train,
epochs=10,
batch_size=128,
validation_split=0.2,
callbacks=[
TensorBoard(log_dir=f"logs/{model.name}"),
EarlyStopping(monitor="val_loss", patience=2),
],
workers=4,
use_multiprocessing=True,
)
model.save(model_file_path)
print(model.summary())
2022-01-17 04:37:26.406166: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory 2022-01-17 04:37:26.406207: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303) 2022-01-17 04:37:26.406234: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (gros-bernard): /proc/driver/nvidia/version does not exist 2022-01-17 04:37:26.406747: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Model: "ffnn_on_spacy" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense (Dense) (None, 100) 30100 dropout (Dropout) (None, 100) 0 dense_1 (Dense) (None, 10) 1010 dense_2 (Dense) (None, 1) 11 ================================================================= Total params: 31,121 Trainable params: 31,121 Non-trainable params: 0 _________________________________________________________________ None
y_train_pred_proba = model.predict(
np.stack(X_train, axis=0),
batch_size=128,
workers=4,
use_multiprocessing=True,
verbose=1,
)
2022-01-17 04:37:36.980418: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1536000000 exceeds 10% of free system memory.
10000/10000 [==============================] - 8s 760us/step
y_train_pred = [round(pred_proba[0]) for pred_proba in y_train_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_train,
y_train,
y_train_pred,
y_train_pred_proba,
title="Train set results",
)
y_test_pred_proba = model.predict(
np.stack(X_test, axis=0),
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_test_pred = [round(pred_proba[0]) for pred_proba in y_test_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_test,
y_test,
y_test_pred,
y_test_pred_proba,
title="Test set results",
)
The performances on the train and test datasets are similar, which indicates that our model has not over-fitted.
The performance on the test dataset is similar to (slightly worse than) our previous model's. The model is still balanced, though slightly less so than before.
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.utils import simple_preprocess
# Processed data path
processed_data_path = os.path.join("..", "data", "processed")
vectorized_dataset_file_path = os.path.join(
processed_data_path, "doc2vec_vectorized_dataset.pkl"
)
if os.path.exists(vectorized_dataset_file_path):
# Load vectorized dataset
with open(vectorized_dataset_file_path, "rb") as f:
X = pickle.load(f)
else:
# Tag documents for training
corpus = [
TaggedDocument(words=simple_preprocess(doc), tags=[i])
for i, doc in enumerate(df.text)
]
# Train doc2vec model
doc2vec = Doc2Vec()
doc2vec.build_vocab(corpus)
doc2vec.train(corpus, total_examples=doc2vec.corpus_count, epochs=doc2vec.epochs)
# Vectorize text
X = [doc2vec.infer_vector(doc.words) for doc in corpus]
# Save vectorized dataset as pickle
with open(vectorized_dataset_file_path, "wb") as f:
pickle.dump(X, f)
1247245it [11:20, 1833.20it/s]
from sklearn.model_selection import train_test_split
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
X,
df.target,
test_size=0.2,
stratify=df.target,
random_state=42,
)
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import (
Input,
TextVectorization,
Dense,
Dropout,
Activation,
Embedding,
)
from keras.callbacks import TensorBoard, EarlyStopping
from keras.metrics import AUC
model_name = "ffnn_on_doc2vec"
vector_size = len(X[0])
results_data_path = os.path.join("..", "results")
model_file_path = os.path.join(results_data_path, model_name)
if os.path.exists(model_file_path):
# Load model
model = load_model(model_file_path)
else:
# define NN model
model = Sequential(name=model_name)
model.add(Dense(100, input_shape=(vector_size,), activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(10, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
# compile NN network
model.compile(
loss="binary_crossentropy",
optimizer="adam",
metrics=[
"accuracy",
AUC(curve="ROC", name="ROC_AUC"),
AUC(curve="PR", name="AP"),
],
)
# fit NN model
model.fit(
np.stack(X_train, axis=0),
y_train,
epochs=10,
batch_size=128,
validation_split=0.2,
callbacks=[
TensorBoard(log_dir=f"logs/{model.name}"),
EarlyStopping(monitor="val_loss", patience=2),
],
workers=4,
use_multiprocessing=True,
)
model.save(model_file_path)
print(model.summary())
2022-02-02 12:58:48.487427: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory 2022-02-02 12:58:48.490233: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303) 2022-02-02 12:58:48.495825: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (gros-bernard): /proc/driver/nvidia/version does not exist 2022-02-02 12:58:48.526015: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2022-02-02 12:58:51.613540: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 409600000 exceeds 10% of free system memory.
Epoch 1/10
8000/8000 [==============================] - 18s 2ms/step - loss: 0.5611 - accuracy: 0.7120 - ROC_AUC: 0.7832 - AP: 0.7717 - val_loss: 0.5445 - val_accuracy: 0.7227 - val_ROC_AUC: 0.8002 - val_AP: 0.7911
Epoch 2/10
8000/8000 [==============================] - 14s 2ms/step - loss: 0.5414 - accuracy: 0.7254 - ROC_AUC: 0.8013 - AP: 0.7923 - val_loss: 0.5355 - val_accuracy: 0.7284 - val_ROC_AUC: 0.8076 - val_AP: 0.8019
Epoch 3/10
8000/8000 [==============================] - 13s 2ms/step - loss: 0.5345 - accuracy: 0.7298 - ROC_AUC: 0.8070 - AP: 0.7997 - val_loss: 0.5305 - val_accuracy: 0.7320 - val_ROC_AUC: 0.8122 - val_AP: 0.8077
Epoch 4/10
8000/8000 [==============================] - 13s 2ms/step - loss: 0.5313 - accuracy: 0.7317 - ROC_AUC: 0.8097 - AP: 0.8035 - val_loss: 0.5260 - val_accuracy: 0.7366 - val_ROC_AUC: 0.8146 - val_AP: 0.8095
Epoch 5/10
8000/8000 [==============================] - 14s 2ms/step - loss: 0.5292 - accuracy: 0.7331 - ROC_AUC: 0.8114 - AP: 0.8057 - val_loss: 0.5247 - val_accuracy: 0.7369 - val_ROC_AUC: 0.8155 - val_AP: 0.8112
Epoch 6/10
8000/8000 [==============================] - 12s 2ms/step - loss: 0.5276 - accuracy: 0.7343 - ROC_AUC: 0.8127 - AP: 0.8074 - val_loss: 0.5238 - val_accuracy: 0.7369 - val_ROC_AUC: 0.8164 - val_AP: 0.8124
Epoch 7/10
8000/8000 [==============================] - 12s 2ms/step - loss: 0.5262 - accuracy: 0.7351 - ROC_AUC: 0.8139 - AP: 0.8088 - val_loss: 0.5219 - val_accuracy: 0.7392 - val_ROC_AUC: 0.8178 - val_AP: 0.8138
Epoch 8/10
8000/8000 [==============================] - 13s 2ms/step - loss: 0.5251 - accuracy: 0.7362 - ROC_AUC: 0.8149 - AP: 0.8098 - val_loss: 0.5230 - val_accuracy: 0.7378 - val_ROC_AUC: 0.8170 - val_AP: 0.8130
Epoch 9/10
8000/8000 [==============================] - 17s 2ms/step - loss: 0.5246 - accuracy: 0.7366 - ROC_AUC: 0.8153 - AP: 0.8103 - val_loss: 0.5212 - val_accuracy: 0.7389 - val_ROC_AUC: 0.8187 - val_AP: 0.8149
Epoch 10/10
8000/8000 [==============================] - 13s 2ms/step - loss: 0.5238 - accuracy: 0.7371 - ROC_AUC: 0.8159 - AP: 0.8110 - val_loss: 0.5200 - val_accuracy: 0.7403 - val_ROC_AUC: 0.8194 - val_AP: 0.8154
2022-02-02 13:01:10.661569: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
INFO:tensorflow:Assets written to: ../results/ffnn_on_doc2vec/assets
INFO:tensorflow:Assets written to: ../results/ffnn_on_doc2vec/assets
Model: "ffnn_on_doc2vec" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense (Dense) (None, 100) 10100 dropout (Dropout) (None, 100) 0 dense_1 (Dense) (None, 10) 1010 dense_2 (Dense) (None, 1) 11 ================================================================= Total params: 11,121 Trainable params: 11,121 Non-trainable params: 0 _________________________________________________________________ None
y_train_pred_proba = model.predict(
np.stack(X_train, axis=0),
batch_size=128,
workers=4,
use_multiprocessing=True,
verbose=1,
)
40/10000 [..............................] - ETA: 12s
2022-02-02 13:01:14.316397: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 512000000 exceeds 10% of free system memory.
10000/10000 [==============================] - 12s 1ms/step
y_train_pred = [round(pred_proba[0]) for pred_proba in y_train_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_train,
y_train,
y_train_pred,
y_train_pred_proba,
title="Train set results",
)
y_test_pred_proba = model.predict(
np.stack(X_test, axis=0),
batch_size=128,
workers=4,
use_multiprocessing=True,
)
2022-02-02 13:01:31.154682: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 128000000 exceeds 10% of free system memory.
y_test_pred = [round(pred_proba[0]) for pred_proba in y_test_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_test,
y_test,
y_test_pred,
y_test_pred_proba,
title="Test set results",
)
The performances on the train and test datasets are similar, which indicates that our model has not over-fitted.
The performances on the test dataset are better than our baseline model's, but not as good as our previous models':
Our model is quite well balanced: it predicted only 3.2% (baseline = 35%, -91%) more POSITIVE (162531) messages than NEGATIVE (157469).
In the following models, we will add an embedding layer to our architecture. The goal of such a layer is to represent words as vectors so that similar words have similar vectors and so that semantic relations between words are preserved (e.g. "Paris" is to "France" what "London" is to "England"), as illustrated below.
The input of an embedding layer is an encoded version of the text data. As a text pre-processing step, we are going to test two encoding methods.
The neural network architecture after the embedding layer remains the same as before.
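As a rough illustration of such relations, we can use the pre-trained spaCy vectors loaded earlier (the Embedding layers trained below learn their own vectors from the tweets, so they will not necessarily show relations this clean) and check that "Paris" - "France" + "England" points towards "London":
import numpy as np
import spacy
nlp = spacy.load("en_core_web_lg")
# Word vectors from the pre-trained model (illustrative check only).
paris, france, london, england = (nlp.vocab[w].vector for w in ("paris", "france", "london", "england"))
analogy = paris - france + england
# Cosine similarity between the analogy vector and "london"; close to 1 means the relation holds.
print(np.dot(analogy, london) / (np.linalg.norm(analogy) * np.linalg.norm(london)))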
In this model, we will use a basic encoding method: each word is converted to a number (its index in the vectorizer's vocabulary).
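As an illustrative sketch of this encoding (toy sentences only, with a hypothetical `toy_encoder` name), TextVectorization in "int" mode maps each word to its integer index in the learned vocabulary and pads sequences to a fixed length, which is the kind of input the Embedding layer below expects:
import tensorflow as tf
from keras.layers import TextVectorization
toy_encoder = TextVectorization(output_mode="int", max_tokens=100, output_sequence_length=6)
toy_encoder.adapt(tf.constant(["good movie", "bad movie", "good plot"]))
# Each word becomes an integer index; shorter sequences are padded with zeros.
print(toy_encoder(tf.constant(["good movie bad"])).numpy())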
from sklearn.model_selection import train_test_split
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
df.text,
df.target,
test_size=0.2,
stratify=df.target,
random_state=42,
)
from keras.models import load_model
from keras.models import Sequential
from keras.layers import (
Input,
TextVectorization,
Dense,
Dropout,
Activation,
Flatten,
Embedding,
)
from keras.callbacks import TensorBoard, EarlyStopping
from keras.metrics import AUC
# Model constants.
model_name = "ffnn_embedding_on_encoded"
max_features = 10000
embedding_dim = 100
sequence_length = 30
results_data_path = os.path.join("..", "results")
model_file_path = os.path.join(results_data_path, model_name)
if os.path.exists(model_file_path):
# Load model
model = load_model(model_file_path)
else:
# Define vectorizer
vectorize_layer = TextVectorization(
output_mode="int",
max_tokens=max_features,
output_sequence_length=sequence_length,
)
vectorize_layer.adapt(
df.text,
batch_size=128,
)
# define NN model
model = Sequential(name=model_name)
model.add(Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
# Embedding layer
model.add(
Embedding(
max_features,
embedding_dim,
input_length=sequence_length,
)
)
model.add(Flatten())
# Dense layers
model.add(Dense(100, input_shape=(max_features,), activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(10, activation="relu"))
# Classification layer
model.add(Dense(1, activation="sigmoid"))
# compile NN network
model.compile(
loss="binary_crossentropy",
optimizer="adam",
metrics=[
"accuracy",
AUC(curve="ROC", name="ROC_AUC"),
AUC(curve="PR", name="AP"),
],
)
# fit NN model
model.fit(
X_train,
y_train,
epochs=10,
batch_size=128,
validation_split=0.2,
callbacks=[
TensorBoard(log_dir=f"logs/{model.name}"),
EarlyStopping(monitor="val_loss", patience=2),
],
workers=4,
use_multiprocessing=True,
)
model.save(model_file_path)
print(model.summary())
Epoch 1/10
8000/8000 [==============================] - 181s 22ms/step - loss: 0.4397 - accuracy: 0.7939 - ROC_AUC: 0.8769 - AP: 0.8775 - val_loss: 0.4164 - val_accuracy: 0.8079 - val_ROC_AUC: 0.8907 - val_AP: 0.8923
Epoch 2/10
8000/8000 [==============================] - 174s 22ms/step - loss: 0.3987 - accuracy: 0.8175 - ROC_AUC: 0.9003 - AP: 0.9016 - val_loss: 0.4116 - val_accuracy: 0.8114 - val_ROC_AUC: 0.8934 - val_AP: 0.8945
Epoch 3/10
8000/8000 [==============================] - 171s 21ms/step - loss: 0.3654 - accuracy: 0.8346 - ROC_AUC: 0.9169 - AP: 0.9187 - val_loss: 0.4259 - val_accuracy: 0.8089 - val_ROC_AUC: 0.8896 - val_AP: 0.8887
Epoch 4/10
8000/8000 [==============================] - 161s 20ms/step - loss: 0.3304 - accuracy: 0.8525 - ROC_AUC: 0.9325 - AP: 0.9345 - val_loss: 0.4607 - val_accuracy: 0.8038 - val_ROC_AUC: 0.8828 - val_AP: 0.8799
INFO:tensorflow:Assets written to: ../results/ffnn_embedding_on_encoded/assets
INFO:tensorflow:Assets written to: ../results/ffnn_embedding_on_encoded/assets
Model: "ffnn_embedding_on_encoded" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= text_vectorization_1 (TextV (None, 30) 0 ectorization) embedding_2 (Embedding) (None, 30, 100) 1000000 flatten_2 (Flatten) (None, 3000) 0 dense_6 (Dense) (None, 100) 300100 dropout_2 (Dropout) (None, 100) 0 dense_7 (Dense) (None, 10) 1010 dense_8 (Dense) (None, 1) 11 ================================================================= Total params: 1,301,121 Trainable params: 1,301,121 Non-trainable params: 0 _________________________________________________________________ None
y_train_pred_proba = model.predict(
X_train,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_train_pred = [round(pred_proba[0]) for pred_proba in y_train_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_train,
y_train,
y_train_pred,
y_train_pred_proba,
title="Train set results",
)
y_test_pred_proba = model.predict(
X_test,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_test_pred = [round(pred_proba[0]) for pred_proba in y_test_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_test,
y_test,
y_test_pred,
y_test_pred_proba,
title="Test set results",
)
The performance on the train dataset is better than on the test dataset, which indicates that our model has slightly over-fitted.
The performance on the test dataset is similar to our previous models.
This model is also more biased than the previous ones.
# Tokenizers, Stemmers and Lemmatizers
from transformers import BertTokenizerFast
# Processed data path
processed_data_path = os.path.join("..", "data", "processed")
encoded_dataset_file_path = os.path.join(
processed_data_path, "bert_encoded_dataset.pkl"
)
if os.path.exists(encoded_dataset_file_path):
# Load encoded dataset
with open(encoded_dataset_file_path, "rb") as f:
X = pickle.load(f)
else:
bert = BertTokenizerFast.from_pretrained("bert-base-uncased")
# Encode text
X = [bert.encode(doc) for doc in tqdm(df.text)]
# Save vectorized dataset as pickle
with open(encoded_dataset_file_path, "wb") as f:
pickle.dump(X, f)
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
# Pad sequences
X = pad_sequences(X, maxlen=max(map(len, X)), padding="post")
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
X,
df.target,
test_size=0.2,
stratify=df.target,
random_state=42,
)
from keras.models import load_model
from keras.models import Sequential
from keras.layers import (
Input,
TextVectorization,
Dense,
Dropout,
Activation,
Flatten,
Embedding,
)
from keras.callbacks import TensorBoard, EarlyStopping
from keras.metrics import AUC
# Model constants.
model_name = "ffnn_embedding_on_bert"
embedding_dim = 100
max_features = max(map(max, X)) + 1
sequence_length = max(map(len, X))
results_data_path = os.path.join("..", "results")
model_file_path = os.path.join(results_data_path, model_name)
if os.path.exists(model_file_path):
# Load model
model = load_model(model_file_path)
else:
# define NN model
model = Sequential(name=model_name)
# Embedding layer
model.add(
Embedding(
max_features,
embedding_dim,
input_length=sequence_length,
)
)
model.add(Flatten())
# Dense layers
model.add(Dense(100, input_shape=(max_features,), activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(10, activation="relu"))
# Classification layer
model.add(Dense(1, activation="sigmoid"))
# compile NN network
model.compile(
loss="binary_crossentropy",
optimizer="adam",
metrics=[
"accuracy",
AUC(curve="ROC", name="ROC_AUC"),
AUC(curve="PR", name="AP"),
],
)
# fit NN model
model.fit(
X_train,
y_train,
epochs=10,
batch_size=128,
validation_split=0.2,
callbacks=[
TensorBoard(log_dir=f"logs/{model.name}"),
EarlyStopping(monitor="val_loss", patience=2),
],
workers=4,
use_multiprocessing=True,
)
model.save(model_file_path)
print(model.summary())
Model: "ffnn_embedding_on_bert" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_1 (Embedding) (None, 164, 100) 3026600 flatten_1 (Flatten) (None, 16400) 0 dense_3 (Dense) (None, 100) 1640100 dropout_1 (Dropout) (None, 100) 0 dense_4 (Dense) (None, 10) 1010 dense_5 (Dense) (None, 1) 11 ================================================================= Total params: 4,667,721 Trainable params: 4,667,721 Non-trainable params: 0 _________________________________________________________________ None
y_train_pred_proba = model.predict(
X_train,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_train_pred = [round(pred_proba[0]) for pred_proba in y_train_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_train,
y_train,
y_train_pred,
y_train_pred_proba,
title="Train set results",
)
2022-01-17 10:44:54.594106: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 839680000 exceeds 10% of free system memory.
y_test_pred_proba = model.predict(
X_test,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_test_pred = [round(pred_proba[0]) for pred_proba in y_test_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_test,
y_test,
y_test_pred,
y_test_pred_proba,
title="Test set results",
)
The performance on the train dataset is better than on the test dataset, which indicates that our model has slightly over-fitted.
The performance on the test dataset is similar to our previous models'.
One good thing: this model is almost perfectly balanced! It predicted only 10 more NEGATIVE (160005) messages than POSITIVE (159995).
In the following models, we will add RNN or (Bidirectional-)LSTM layers to our architecture. The goal of such layers is to carry information from the previous timesteps of the sequence while processing the current one, as illustrated below.
The neural network architecture after the recurrent layer remains the same as before (with the Embedding layer).
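As a quick, purely illustrative shape check (a random toy batch with the same dimensions as our models): a recurrent layer reads the (batch, timesteps, embedding_dim) output of the Embedding layer step by step and, by default, keeps only its final hidden state.
import numpy as np
from keras.layers import SimpleRNN
embedded_batch = np.random.rand(2, 30, 100).astype("float32")  # (batch, timesteps, embedding_dim)
# With return_sequences=False (the default) only the final hidden state is kept.
print(SimpleRNN(units=100)(embedded_batch).shape)  # (2, 100)
# With return_sequences=True one hidden state per timestep is returned.
print(SimpleRNN(units=100, return_sequences=True)(embedded_batch).shape)  # (2, 30, 100)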
from sklearn.model_selection import train_test_split
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
df.text,
df.target,
test_size=0.2,
stratify=df.target,
random_state=42,
)
from keras.models import load_model
from keras.models import Sequential
from keras.layers import (
Input,
TextVectorization,
Dense,
Dropout,
Activation,
Flatten,
Embedding,
SimpleRNN,
GRU,
LSTM,
)
from keras.callbacks import TensorBoard, EarlyStopping
from keras.metrics import AUC
# Model constants.
model_name = "rnn_on_embedded"
max_features = 10000
sequence_length = 30
embedding_dim = 100
rnn_units = 100
results_data_path = os.path.join("..", "results")
model_file_path = os.path.join(results_data_path, model_name)
if os.path.exists(model_file_path):
# Load model
model = load_model(model_file_path)
else:
# Define vectorizer
vectorize_layer = TextVectorization(
output_mode="int",
max_tokens=max_features,
output_sequence_length=sequence_length,
)
vectorize_layer.adapt(
df.text,
batch_size=128,
)
# define NN model
model = Sequential(name=model_name)
model.add(Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
# Embedding layer
model.add(
Embedding(
max_features,
embedding_dim,
input_length=sequence_length,
)
)
# RNN layer
model.add(SimpleRNN(units=rnn_units, dropout=0.2))
# Dense layers
model.add(Dense(100, input_shape=(max_features,), activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(10, activation="relu"))
# Classification layer
model.add(Dense(1, activation="sigmoid"))
# compile NN network
model.compile(
loss="binary_crossentropy",
optimizer="adam",
metrics=[
"accuracy",
AUC(curve="ROC", name="ROC_AUC"),
AUC(curve="PR", name="AP"),
],
)
# fit NN model
model.fit(
X_train,
y_train,
epochs=10,
batch_size=128,
validation_split=0.2,
callbacks=[
TensorBoard(log_dir=f"logs/{model.name}"),
EarlyStopping(monitor="val_loss", patience=2),
],
workers=4,
use_multiprocessing=True,
)
model.save(model_file_path)
print(model.summary())
Epoch 1/10
8000/8000 [==============================] - 233s 29ms/step - loss: 0.4615 - accuracy: 0.7854 - ROC_AUC: 0.8638 - AP: 0.8611 - val_loss: 0.4442 - val_accuracy: 0.7971 - val_ROC_AUC: 0.8805 - val_AP: 0.8809
Epoch 2/10
8000/8000 [==============================] - 222s 28ms/step - loss: 0.4321 - accuracy: 0.8032 - ROC_AUC: 0.8822 - AP: 0.8793 - val_loss: 0.4262 - val_accuracy: 0.8076 - val_ROC_AUC: 0.8878 - val_AP: 0.8873
Epoch 3/10
8000/8000 [==============================] - 212s 26ms/step - loss: 0.4211 - accuracy: 0.8086 - ROC_AUC: 0.8886 - AP: 0.8859 - val_loss: 0.4193 - val_accuracy: 0.8053 - val_ROC_AUC: 0.8897 - val_AP: 0.8891
Epoch 4/10
8000/8000 [==============================] - 203s 25ms/step - loss: 0.4153 - accuracy: 0.8125 - ROC_AUC: 0.8919 - AP: 0.8891 - val_loss: 0.4218 - val_accuracy: 0.8086 - val_ROC_AUC: 0.8891 - val_AP: 0.8853
Epoch 5/10
8000/8000 [==============================] - 207s 26ms/step - loss: 0.4135 - accuracy: 0.8136 - ROC_AUC: 0.8929 - AP: 0.8896 - val_loss: 0.4208 - val_accuracy: 0.8070 - val_ROC_AUC: 0.8900 - val_AP: 0.8871
INFO:tensorflow:Assets written to: ../results/rnn_on_encoded/assets
INFO:tensorflow:Assets written to: ../results/rnn_on_encoded/assets
Model: "rnn_on_encoded" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= text_vectorization_4 (TextV (None, 30) 0 ectorization) embedding_3 (Embedding) (None, 30, 100) 1000000 simple_rnn_2 (SimpleRNN) (None, 100) 20100 dense_9 (Dense) (None, 100) 10100 dropout_3 (Dropout) (None, 100) 0 dense_10 (Dense) (None, 10) 1010 dense_11 (Dense) (None, 1) 11 ================================================================= Total params: 1,031,221 Trainable params: 1,031,221 Non-trainable params: 0 _________________________________________________________________ None
y_train_pred_proba = model.predict(
X_train,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_train_pred = [round(pred_proba[0]) for pred_proba in y_train_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_train,
y_train,
y_train_pred,
y_train_pred_proba,
title="Train set results",
)
y_test_pred_proba = model.predict(
X_test,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_test_pred = [round(pred_proba[0]) for pred_proba in y_test_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_test,
y_test,
y_test_pred,
y_test_pred_proba,
title="Test set results",
)
The performances on the train and test datasets are similar, which indicates that our model has not over-fitted.
The performances on the test dataset are slightly better than our previous models':
Our model is much less biased than our baseline, but still leans towards the NEGATIVE class: it predicted 16% (baseline = 35%, -54%) more NEGATIVE (171614) messages than POSITIVE (148386).
In this model, we use an LSTM (Long Short-Term Memory) layer instead of the simple RNN layer. LSTM cells use gates to retain information over longer spans of the sequence, which mitigates the vanishing-gradient problem of simple RNNs; this is also why the LSTM layer has roughly four times as many parameters as a SimpleRNN with the same number of units (80,400 vs 20,100 in the model summaries).
from sklearn.model_selection import train_test_split
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
df.text,
df.target,
test_size=0.2,
stratify=df.target,
random_state=42,
)
from keras.models import load_model
from keras.models import Sequential
from keras.layers import (
Input,
TextVectorization,
Dense,
Dropout,
Activation,
Flatten,
Embedding,
SimpleRNN,
GRU,
LSTM,
)
from keras.callbacks import TensorBoard, EarlyStopping
from keras.metrics import AUC
# Model constants.
model_name = "lstm_on_embedded"
max_features = 10000
sequence_length = 30
embedding_dim = 100
rnn_units = 100
results_data_path = os.path.join("..", "results")
model_file_path = os.path.join(results_data_path, model_name)
if os.path.exists(model_file_path):
# Load model
model = load_model(model_file_path)
else:
# Define vectorizer
vectorize_layer = TextVectorization(
output_mode="int",
max_tokens=max_features,
output_sequence_length=sequence_length,
)
vectorize_layer.adapt(
df.text,
batch_size=128,
)
# define NN model
model = Sequential(name=model_name)
model.add(Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
# Embedding layer
model.add(
Embedding(
max_features,
embedding_dim,
input_length=sequence_length,
)
)
# LSTM layer
model.add(LSTM(units=rnn_units, dropout=0.2))
# Dense layers
model.add(Dense(100, input_shape=(max_features,), activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(10, activation="relu"))
# Classification layer
model.add(Dense(1, activation="sigmoid"))
# compile NN network
model.compile(
loss="binary_crossentropy",
optimizer="adam",
metrics=[
"accuracy",
AUC(curve="ROC", name="ROC_AUC"),
AUC(curve="PR", name="AP"),
],
)
# fit NN model
model.fit(
X_train,
y_train,
epochs=10,
batch_size=128,
validation_split=0.2,
callbacks=[
TensorBoard(log_dir=f"logs/{model.name}"),
EarlyStopping(monitor="val_loss", patience=2),
],
workers=4,
use_multiprocessing=True,
)
model.save(model_file_path)
print(model.summary())
Epoch 1/10
8000/8000 [==============================] - 438s 54ms/step - loss: 0.4339 - accuracy: 0.7981 - ROC_AUC: 0.8807 - AP: 0.8797 - val_loss: 0.4073 - val_accuracy: 0.8138 - val_ROC_AUC: 0.8978 - val_AP: 0.8985
Epoch 2/10
8000/8000 [==============================] - 424s 53ms/step - loss: 0.3955 - accuracy: 0.8205 - ROC_AUC: 0.9022 - AP: 0.9022 - val_loss: 0.3905 - val_accuracy: 0.8212 - val_ROC_AUC: 0.9051 - val_AP: 0.9059
Epoch 3/10
8000/8000 [==============================] - 403s 50ms/step - loss: 0.3778 - accuracy: 0.8297 - ROC_AUC: 0.9111 - AP: 0.9117 - val_loss: 0.3868 - val_accuracy: 0.8242 - val_ROC_AUC: 0.9073 - val_AP: 0.9081
Epoch 4/10
8000/8000 [==============================] - 405s 51ms/step - loss: 0.3638 - accuracy: 0.8370 - ROC_AUC: 0.9179 - AP: 0.9186 - val_loss: 0.3860 - val_accuracy: 0.8261 - val_ROC_AUC: 0.9083 - val_AP: 0.9092
Epoch 5/10
8000/8000 [==============================] - 448s 56ms/step - loss: 0.3518 - accuracy: 0.8432 - ROC_AUC: 0.9234 - AP: 0.9244 - val_loss: 0.3870 - val_accuracy: 0.8265 - val_ROC_AUC: 0.9076 - val_AP: 0.9082
Epoch 6/10
8000/8000 [==============================] - 444s 55ms/step - loss: 0.3410 - accuracy: 0.8489 - ROC_AUC: 0.9282 - AP: 0.9291 - val_loss: 0.3857 - val_accuracy: 0.8277 - val_ROC_AUC: 0.9081 - val_AP: 0.9079
Epoch 7/10
8000/8000 [==============================] - 410s 51ms/step - loss: 0.3313 - accuracy: 0.8537 - ROC_AUC: 0.9323 - AP: 0.9333 - val_loss: 0.3938 - val_accuracy: 0.8255 - val_ROC_AUC: 0.9067 - val_AP: 0.9065
Epoch 8/10
8000/8000 [==============================] - 408s 51ms/step - loss: 0.3223 - accuracy: 0.8583 - ROC_AUC: 0.9361 - AP: 0.9371 - val_loss: 0.3955 - val_accuracy: 0.8240 - val_ROC_AUC: 0.9049 - val_AP: 0.9037
WARNING:absl:Found untraced functions such as lstm_cell_layer_call_fn, lstm_cell_layer_call_and_return_conditional_losses, lstm_cell_layer_call_fn, lstm_cell_layer_call_and_return_conditional_losses, lstm_cell_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.
INFO:tensorflow:Assets written to: ../results/lstm_on_embedded/assets
INFO:tensorflow:Assets written to: ../results/lstm_on_embedded/assets WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7f6d7eebf2b0> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function.
Model: "lstm_on_embedded" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= text_vectorization_6 (TextV (None, 30) 0 ectorization) embedding_5 (Embedding) (None, 30, 100) 1000000 lstm (LSTM) (None, 100) 80400 dense_15 (Dense) (None, 100) 10100 dropout_5 (Dropout) (None, 100) 0 dense_16 (Dense) (None, 10) 1010 dense_17 (Dense) (None, 1) 11 ================================================================= Total params: 1,091,521 Trainable params: 1,091,521 Non-trainable params: 0 _________________________________________________________________ None
y_train_pred_proba = model.predict(
X_train,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_train_pred = [round(pred_proba[0]) for pred_proba in y_train_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_train,
y_train,
y_train_pred,
y_train_pred_proba,
title="Train set results",
)
y_test_pred_proba = model.predict(
X_test,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_test_pred = [round(pred_proba[0]) for pred_proba in y_test_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_test,
y_test,
y_test_pred,
y_test_pred_proba,
title="Test set results",
)
The performance on the train dataset is better than on the test dataset, which indicates that our model has slightly over-fitted.
The performances on the test dataset are slightly better than our previous models':
Our model is much less biased than our baseline, but still leans towards the POSITIVE class: it predicted 11% (baseline = 35%, -69%) more POSITIVE (168543) messages than NEGATIVE (151457).
In this model, we use a Bidirectional-LSTM layer instead of the RNN layer. The goal of such a layer is to use information from both past (backward) and future (forward) states of the sequence simultaneously, as sketched below.
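As a quick, purely illustrative shape check (random toy batch): the Bidirectional wrapper runs one LSTM forward and one backward over the sequence and concatenates their final states, so the output size doubles.
import numpy as np
from keras.layers import LSTM, Bidirectional
embedded_batch = np.random.rand(2, 30, 100).astype("float32")  # (batch, timesteps, embedding_dim)
print(LSTM(units=100)(embedded_batch).shape)                 # (2, 100)
print(Bidirectional(LSTM(units=100))(embedded_batch).shape)  # (2, 200): forward and backward states concatenated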
from sklearn.model_selection import train_test_split
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
df.text,
df.target,
test_size=0.2,
stratify=df.target,
random_state=42,
)
from keras.models import load_model
from keras.models import Sequential
from keras.layers import (
Input,
TextVectorization,
Dense,
Dropout,
Activation,
Flatten,
Embedding,
Bidirectional,
SimpleRNN,
GRU,
LSTM,
)
from keras.callbacks import TensorBoard, EarlyStopping
from keras.metrics import AUC
# Model constants.
model_name = "bidirectional_lstm_on_embedded"
max_features = 10000
sequence_length = 30
embedding_dim = 100
rnn_units = 100
results_data_path = os.path.join("..", "results")
model_file_path = os.path.join(results_data_path, model_name)
if os.path.exists(model_file_path):
# Load model
model = load_model(model_file_path)
else:
# Define vectorizer
vectorize_layer = TextVectorization(
output_mode="int",
max_tokens=max_features,
output_sequence_length=sequence_length,
)
vectorize_layer.adapt(
df.text,
batch_size=128,
)
# define NN model
model = Sequential(name=model_name)
model.add(Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
# Embedding layer
model.add(
Embedding(
max_features,
embedding_dim,
input_length=sequence_length,
)
)
# Bidirectional LSTM layer
model.add(Bidirectional(LSTM(units=rnn_units, dropout=0.2)))
# Dense layers
model.add(Dense(100, input_shape=(max_features,), activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(10, activation="relu"))
# Classification layer
model.add(Dense(1, activation="sigmoid"))
# compile NN network
model.compile(
loss="binary_crossentropy",
optimizer="adam",
metrics=[
"accuracy",
AUC(curve="ROC", name="ROC_AUC"),
AUC(curve="PR", name="AP"),
],
)
# fit NN model
model.fit(
X_train,
y_train,
epochs=10,
batch_size=128,
validation_split=0.2,
callbacks=[
TensorBoard(log_dir=f"logs/{model.name}"),
EarlyStopping(monitor="val_loss", patience=2),
],
workers=4,
use_multiprocessing=True,
)
model.save(model_file_path)
print(model.summary())
Epoch 1/10
8000/8000 [==============================] - 678s 84ms/step - loss: 0.4264 - accuracy: 0.8027 - ROC_AUC: 0.8849 - AP: 0.8848 - val_loss: 0.3973 - val_accuracy: 0.8180 - val_ROC_AUC: 0.9015 - val_AP: 0.9027
Epoch 2/10
8000/8000 [==============================] - 723s 90ms/step - loss: 0.3885 - accuracy: 0.8236 - ROC_AUC: 0.9057 - AP: 0.9063 - val_loss: 0.3861 - val_accuracy: 0.8254 - val_ROC_AUC: 0.9074 - val_AP: 0.9085
Epoch 3/10
8000/8000 [==============================] - 796s 100ms/step - loss: 0.3711 - accuracy: 0.8331 - ROC_AUC: 0.9143 - AP: 0.9152 - val_loss: 0.3819 - val_accuracy: 0.8276 - val_ROC_AUC: 0.9093 - val_AP: 0.9103
Epoch 4/10
8000/8000 [==============================] - 860s 108ms/step - loss: 0.3563 - accuracy: 0.8404 - ROC_AUC: 0.9213 - AP: 0.9224 - val_loss: 0.3847 - val_accuracy: 0.8270 - val_ROC_AUC: 0.9095 - val_AP: 0.9104
Epoch 5/10
8000/8000 [==============================] - 640s 80ms/step - loss: 0.3433 - accuracy: 0.8473 - ROC_AUC: 0.9271 - AP: 0.9283 - val_loss: 0.3867 - val_accuracy: 0.8283 - val_ROC_AUC: 0.9092 - val_AP: 0.9096
WARNING:absl:Found untraced functions such as lstm_cell_2_layer_call_fn, lstm_cell_2_layer_call_and_return_conditional_losses, lstm_cell_3_layer_call_fn, lstm_cell_3_layer_call_and_return_conditional_losses, lstm_cell_2_layer_call_fn while saving (showing 5 of 10). These functions will not be directly callable after loading.
INFO:tensorflow:Assets written to: ../results/bidirectional_lstm_on_embedded/assets
INFO:tensorflow:Assets written to: ../results/bidirectional_lstm_on_embedded/assets WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7f6d7b6cee20> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7f6d7b6b2c10> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function.
Model: "bidirectional_lstm_on_embedded" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= text_vectorization_7 (TextV (None, 30) 0 ectorization) embedding_6 (Embedding) (None, 30, 100) 1000000 bidirectional (Bidirectiona (None, 200) 160800 l) dense_18 (Dense) (None, 100) 20100 dropout_6 (Dropout) (None, 100) 0 dense_19 (Dense) (None, 10) 1010 dense_20 (Dense) (None, 1) 11 ================================================================= Total params: 1,181,921 Trainable params: 1,181,921 Non-trainable params: 0 _________________________________________________________________ None
y_train_pred_proba = model.predict(
X_train,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_train_pred = [round(pred_proba[0]) for pred_proba in y_train_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_train,
y_train,
y_train_pred,
y_train_pred_proba,
title="Train set results",
)
y_test_pred_proba = model.predict(
X_test,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_test_pred = [round(pred_proba[0]) for pred_proba in y_test_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_test,
y_test,
y_test_pred,
y_test_pred_proba,
title="Test set results",
)
The performance on the train dataset is better than on the test dataset, which indicates that our model has slightly over-fitted.
The performances on the test dataset are slightly better than our previous models':
Our model is almost unbiased! It still leans slightly towards the NEGATIVE class: it predicted only 4.1% (baseline = 35%, -88%) more NEGATIVE (163235) messages than POSITIVE (156765).
In this model, we stack a second Bidirectional-LSTM layer. The first one returns its full output sequence (return_sequences=True) so that the second one still has a sequence of timesteps to process.
from sklearn.model_selection import train_test_split
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
df.text,
df.target,
test_size=0.2,
stratify=df.target,
random_state=42,
)
from keras.models import load_model
from keras.models import Sequential
from keras.layers import (
Input,
TextVectorization,
Dense,
Dropout,
Activation,
Flatten,
Embedding,
Bidirectional,
SimpleRNN,
GRU,
LSTM,
)
from keras.callbacks import TensorBoard, EarlyStopping
from keras.metrics import AUC
# Model constants.
model_name = "bidirectional_lstm_with_return_sequences_on_embedded"
max_features = 10000
sequence_length = 30
embedding_dim = 100
rnn_units = 100
results_data_path = os.path.join("..", "results")
model_file_path = os.path.join(results_data_path, model_name)
if os.path.exists(model_file_path):
# Load model
model = load_model(model_file_path)
else:
# Define vectorizer
vectorize_layer = TextVectorization(
output_mode="int",
max_tokens=max_features,
output_sequence_length=sequence_length,
)
vectorize_layer.adapt(
df.text,
batch_size=128,
)
# define NN model
model = Sequential(name=model_name)
model.add(Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)
# Embedding layer
model.add(
Embedding(
max_features,
embedding_dim,
input_length=sequence_length,
)
)
# Bidirectional LSTM layer
model.add(Bidirectional(LSTM(units=rnn_units, dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(units=rnn_units, dropout=0.2)))
# Dense layers
model.add(Dense(100, input_shape=(max_features,), activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(10, activation="relu"))
# Classification layer
model.add(Dense(1, activation="sigmoid"))
# compile NN network
model.compile(
loss="binary_crossentropy",
optimizer="adam",
metrics=[
"accuracy",
AUC(curve="ROC", name="ROC_AUC"),
AUC(curve="PR", name="AP"),
],
)
# fit NN model
model.fit(
X_train,
y_train,
epochs=10,
batch_size=128,
validation_split=0.2,
callbacks=[
TensorBoard(log_dir=f"logs/{model.name}"),
EarlyStopping(monitor="val_loss", patience=2),
],
workers=4,
use_multiprocessing=True,
)
model.save(model_file_path)
print(model.summary())
Epoch 1/10
8000/8000 [==============================] - 1163s 145ms/step - loss: 0.4258 - accuracy: 0.8024 - ROC_AUC: 0.8852 - AP: 0.8854 - val_loss: 0.4002 - val_accuracy: 0.8156 - val_ROC_AUC: 0.9000 - val_AP: 0.9010
Epoch 2/10
8000/8000 [==============================] - 1150s 144ms/step - loss: 0.3881 - accuracy: 0.8239 - ROC_AUC: 0.9060 - AP: 0.9065 - val_loss: 0.3853 - val_accuracy: 0.8249 - val_ROC_AUC: 0.9075 - val_AP: 0.9087
Epoch 3/10
8000/8000 [==============================] - 1150s 144ms/step - loss: 0.3699 - accuracy: 0.8338 - ROC_AUC: 0.9149 - AP: 0.9158 - val_loss: 0.3809 - val_accuracy: 0.8280 - val_ROC_AUC: 0.9099 - val_AP: 0.9108
Epoch 4/10
8000/8000 [==============================] - 1151s 144ms/step - loss: 0.3551 - accuracy: 0.8419 - ROC_AUC: 0.9219 - AP: 0.9229 - val_loss: 0.3798 - val_accuracy: 0.8293 - val_ROC_AUC: 0.9110 - val_AP: 0.9116
Epoch 5/10
8000/8000 [==============================] - 1152s 144ms/step - loss: 0.3415 - accuracy: 0.8489 - ROC_AUC: 0.9280 - AP: 0.9290 - val_loss: 0.3841 - val_accuracy: 0.8297 - val_ROC_AUC: 0.9107 - val_AP: 0.9113
Epoch 6/10
8000/8000 [==============================] - 1270s 159ms/step - loss: 0.3288 - accuracy: 0.8557 - ROC_AUC: 0.9334 - AP: 0.9346 - val_loss: 0.3890 - val_accuracy: 0.8279 - val_ROC_AUC: 0.9088 - val_AP: 0.9088
WARNING:absl:Found untraced functions such as lstm_cell_5_layer_call_fn, lstm_cell_5_layer_call_and_return_conditional_losses, lstm_cell_6_layer_call_fn, lstm_cell_6_layer_call_and_return_conditional_losses, lstm_cell_8_layer_call_fn while saving (showing 5 of 20). These functions will not be directly callable after loading.
INFO:tensorflow:Assets written to: ../results/bidirectional_lstm_with_return_sequences_on_embedded/assets
INFO:tensorflow:Assets written to: ../results/bidirectional_lstm_with_return_sequences_on_embedded/assets WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7f6d7d568070> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7f6d7edeb580> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7f6d7ede5a30> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function. WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7f6d7d5fdbb0> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function.
Model: "bidirectional_lstm_with_return_sequences_on_embedded" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= text_vectorization_8 (TextV (None, 30) 0 ectorization) embedding_7 (Embedding) (None, 30, 100) 1000000 bidirectional_1 (Bidirectio (None, 30, 200) 160800 nal) bidirectional_2 (Bidirectio (None, 200) 240800 nal) dense_21 (Dense) (None, 100) 20100 dropout_7 (Dropout) (None, 100) 0 dense_22 (Dense) (None, 10) 1010 dense_23 (Dense) (None, 1) 11 ================================================================= Total params: 1,422,721 Trainable params: 1,422,721 Non-trainable params: 0 _________________________________________________________________ None
y_train_pred_proba = model.predict(
X_train,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_train_pred = [round(pred_proba[0]) for pred_proba in y_train_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_train,
y_train,
y_train_pred,
y_train_pred_proba,
title="Train set results",
)
y_test_pred_proba = model.predict(
X_test,
batch_size=128,
workers=4,
use_multiprocessing=True,
)
y_test_pred = [round(pred_proba[0]) for pred_proba in y_test_pred_proba]
viz_helpers.plot_classifier_results(
model,
X_test,
y_test,
y_test_pred,
y_test_pred_proba,
title="Test set results",
)
The performance on the train dataset is better than on the test dataset, which indicates that our model has slightly over-fitted.
The performances on the test dataset are slightly better than our previous models':
Our model is almost unbiased! It still leans slightly towards the NEGATIVE class: it predicted only 4.0% (baseline = 35%, -89%) more NEGATIVE (163140) messages than POSITIVE (156860).