RNNs for Text Classification with Keras¶

Download & explore 20newsgroups dataset¶

In [1]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train')  #, remove=('headers', 'footers', 'quotes'))
print("Catergories")
print(twenty_train.target_names)
print("-------------")
print("First dataset's sample")
print("\n".join(twenty_train.data[0].split("\n")))
print("------------")
print("First dataset's sample category: ",twenty_train.target[0])
Catergories
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
-------------
First dataset's sample
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





------------
First dataset's sample category:  7

Split dataset into train (70%) & validation (30%)¶

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(twenty_train.data, twenty_train.target, test_size=0.3, random_state=12547392)

twenty_train = fetch_20newsgroups(subset='test')
X_test, y_test = twenty_train.data[:1000], twenty_train.target[:1000]

print('Train samples: {}'.format(len(X_train)))
print('Val samples: {}'.format(len(X_val)))
Train samples: 7919
Val samples: 3395

Use spacy for sentence splitting & tokenization¶

In [3]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm

nlp = spacy.load('en_core_web_sm',disable=["tagger", "parser","ner"])
nlp.add_pipe('sentencizer')

def tokenize_samples(samples):

  tokenized_samples = []
  for i in tqdm(range(len(samples))):
    doc = nlp(samples[i])  # Tokenize the sample into sentences
    tokens = []
    for sent in doc.sents:
      for tok in sent:  # Iterate through the words of the sentence
        if '\n' in tok.text or "\t" in tok.text or "--" in tok.text or "*" in tok.text or tok.text.lower() in STOP_WORDS:
          continue
        if tok.text.strip():
          tokens.append(tok.text.replace('"',"'").strip())
    tokenized_samples.append(tokens)

  return tokenized_samples

X_train_tokenized = tokenize_samples(X_train)
X_val_tokenized = tokenize_samples(X_val)
X_test_tokenized = tokenize_samples(X_test)
  0%|          | 0/7919 [00:00<?, ?it/s]/usr/local/lib/python3.10/dist-packages/spacy/pipeline/lemmatizer.py:211: UserWarning: [W108] The rule-based lemmatizer did not find POS annotation for one or more tokens. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'.
  warnings.warn(Warnings.W108)
100%|██████████| 7919/7919 [04:14<00:00, 31.07it/s]
100%|██████████| 3395/3395 [01:59<00:00, 28.49it/s]
100%|██████████| 1000/1000 [00:25<00:00, 39.15it/s]
In [4]:
import numpy as np
# Get mean and std for length on training set
print('Average length of smples: {}'.format(np.mean([len(x) for x in X_train_tokenized])))
print('Std length of samples: {}'.format(np.std([len(x) for x in X_train_tokenized])))
print('#Samples with length > 1000: {} \n'.format(np.sum([len(x) > 1000 for x in X_train_tokenized])))
print('X_example: {}'.format(X_train_tokenized[0]))
Average length of smples: 240.10670539209497
Std length of samples: 457.42933996960267
#Samples with length > 1000: 178 

X_example: [':', 'kastle@wpi', '.', 'WPI.EDU', '(', 'Jacques', 'W', 'Brouillette', ')', 'Subject', ':', ':', 'WARNING', '.....', '(please', 'read', ')', '...', 'Organization', ':', 'Worcester', 'Polytechnic', 'Institute', 'Lines', ':', '8', 'Distribution', ':', 'world', 'NNTP', '-', 'Posting', '-', 'Host', ':', 'wpi.wpi.edu', 'Keywords', ':', 'BRICK', ',', 'TRUCK', ',', 'DANGER', 'plase', 'cease', 'discussion', '.', 'fail', 'people', 'feel', 'need', 'expound', 'issue', 'days', 'days', 'end', '.', 'areas', 'meant', 'type', 'discussion', '.', 'feel', 'need', 'things', ',', 'thought', '.', 'Thanks', '.', ':', 'want', 'things', 'world', ',', '58', 'Plymouth', 'small', ':', ':', 'OPEC', 'nation', 'fuel', '.', 'good', ':', ':', 'thing', '.', 'Car', 'Smashers', 'home', 'sulk', '.', ':', ':', 'Jacques', 'Brouillette', 'Manufacturing', 'Engineering', ':']

Convert labels to 1-hot vectors¶

In [5]:
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
target_list = twenty_train.target_names

y_train_1_hot = lb.fit_transform([target_list[x] for x in y_train])
y_val_1_hot = lb.transform([target_list[x] for x in y_val])

y_test_1_hot = lb.transform([target_list[x] for x in y_test])

print('Y_example: {}'.format(y_train_1_hot[0]))
Y_example: [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]

Custom Keras callback for calculating f1, precision, recall at the end of each epoch¶

In [6]:
import tensorflow as tf
from sklearn.metrics import f1_score, recall_score, precision_score
import numpy as np
import os


class Metrics(tf.keras.callbacks.Callback):
    def __init__(self, valid_data):
        super(Metrics, self).__init__()
        self.validation_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        val_predict = np.argmax(self.model.predict(self.validation_data[0]), -1)
        val_targ = self.validation_data[1]
        if len(val_targ.shape) == 2 and val_targ.shape[1] != 1:
            val_targ = np.argmax(val_targ, -1)
        val_targ = tf.cast(val_targ,dtype=tf.float32)


        _val_f1 = f1_score(val_targ, val_predict,average="weighted")
        _val_recall = recall_score(val_targ, val_predict,average="weighted")
        _val_precision = precision_score(val_targ, val_predict,average="weighted")

        logs['val_f1'] = _val_f1
        logs['val_recall'] = _val_recall
        logs['val_precision'] = _val_precision
        print(" — val_f1: %f — val_precision: %f — val_recall: %f" % (_val_f1, _val_precision, _val_recall))
        return

Prepare input sequences¶

In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 250
MAX_WORDS = 100000

# Init tokenizer
tokenizer = Tokenizer(num_words=MAX_WORDS)
# num_words: the maximum number of words to keep, based on word frequency.
# oov_token: will be used to replace OOV WORDS

# Fit tokenizer (Updates internal vocabulary based on a list of texts.)
tokenizer.fit_on_texts([" ".join(x) for x in X_train_tokenized])

# Converts text to sequences of IDs
train_seqs = tokenizer.texts_to_sequences([" ".join(x) for x in X_train_tokenized])
val_seqs = tokenizer.texts_to_sequences([" ".join(x) for x in X_val_tokenized])
test_seqs = tokenizer.texts_to_sequences([" ".join(x) for x in X_test_tokenized])

train_data = pad_sequences(train_seqs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
val_data = pad_sequences(val_seqs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_data = pad_sequences(test_seqs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
In [16]:
test_seqs = tokenizer.texts_to_sequences([" ".join(x) for x in X_test_tokenized])
test_data = pad_sequences(test_seqs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

2 different Options:¶

  1. Custom keras Embeddings
  2. Pretrained Word2Vec embeddings

Create and train a BiGRU + MLP model with custom end2end embeddings randomly initialized¶

In [11]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, GRU, Embedding
from tensorflow.keras.optimizers import Adam

BATCH_SIZE=256
EPOCHS=30
GRU_SIZE = 64
DENSE = 32

MAX_WORDS = 100000
EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = 250

# create an empty sequential model
model = Sequential()

# Αdd an embedding layer
model.add(Embedding(input_dim=MAX_WORDS+2, output_dim=EMBEDDING_DIM, # weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,mask_zero=True, trainable=True)) #trainable=False))

# Αdd a bidirectional gru layer with 0.33 variational (recurrent) dropout
model.add(Bidirectional(GRU(GRU_SIZE, input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM), return_sequences=False, recurrent_dropout = 0.33)))
# return_sequences=False: Whether to return the last output in the output sequence, or the full sequence.

# Αdd a hidden MLP layer
model.add(Dropout(0.33))
model.add(Dense(DENSE, activation='relu' ))

# Αdd the output MLP layer
model.add(Dropout(0.33))
model.add(Dense(len(twenty_train.target_names), activation='softmax'))
# Multi-class classification -> Use softmax over all possible classes

# model.build((None, EMBEDDING_DIM, VECTOR_DIMENSION))

print(model.summary())
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),
              metrics=["accuracy"])

# Save model weights after each epoch with ModelCheckpoint
# IF I WANTED TO USE GDRIVE
# '/content/gdrive/My Drive/checkpoints'
if not os.path.exists('/content/checkpoints'):
  os.makedirs('/content/checkpoints')

# '/content/gdrive/My Drive/checkpoints/BiGRUMLP.hdf5'


checkpoint = ModelCheckpoint('/content/checkpoints/BiGRUMLP.hdf5',
                              monitor='val_accuracy',
                              mode='max', verbose=2,
                              save_best_only=True,
                              save_weights_only=True)

history = model.fit(train_data,
                    y_train_1_hot,
                    validation_data=(val_data, y_val_1_hot),
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    shuffle=True,
                    callbacks=[Metrics(valid_data=(val_data, y_val_1_hot)),
                    checkpoint])
WARNING:tensorflow:Layer gru_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer gru_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer gru_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding_3 (Embedding)     (None, 250, 100)          10000200  
                                                                 
 bidirectional_2 (Bidirecti  (None, 128)               63744     
 onal)                                                           
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 32)                4128      
                                                                 
 dropout_5 (Dropout)         (None, 32)                0         
                                                                 
 dense_5 (Dense)             (None, 20)                660       
                                                                 
=================================================================
Total params: 10068732 (38.41 MB)
Trainable params: 10068732 (38.41 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/30
107/107 [==============================] - 17s 149ms/step
 — val_f1: 0.030140 — val_precision: 0.120475 — val_recall: 0.068630

Epoch 1: val_accuracy improved from -inf to 0.06863, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 111s 3s/step - loss: 2.9862 - accuracy: 0.0659 - val_loss: 2.9669 - val_accuracy: 0.0686 - val_f1: 0.0301 - val_recall: 0.0686 - val_precision: 0.1205
Epoch 2/30
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
107/107 [==============================] - 15s 144ms/step
 — val_f1: 0.160939 — val_precision: 0.379353 — val_recall: 0.192636

Epoch 2: val_accuracy improved from 0.06863 to 0.19264, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 88s 3s/step - loss: 2.8937 - accuracy: 0.1137 - val_loss: 2.7587 - val_accuracy: 0.1926 - val_f1: 0.1609 - val_recall: 0.1926 - val_precision: 0.3794
Epoch 3/30
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
107/107 [==============================] - 16s 146ms/step
 — val_f1: 0.321508 — val_precision: 0.447378 — val_recall: 0.368189

Epoch 3: val_accuracy improved from 0.19264 to 0.36819, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 83s 3s/step - loss: 2.4954 - accuracy: 0.2279 - val_loss: 2.2604 - val_accuracy: 0.3682 - val_f1: 0.3215 - val_recall: 0.3682 - val_precision: 0.4474
Epoch 4/30
107/107 [==============================] - 15s 143ms/step
 — val_f1: 0.509427 — val_precision: 0.545646 — val_recall: 0.520471

Epoch 4: val_accuracy improved from 0.36819 to 0.52047, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 86s 3s/step - loss: 1.8926 - accuracy: 0.4070 - val_loss: 1.7155 - val_accuracy: 0.5205 - val_f1: 0.5094 - val_recall: 0.5205 - val_precision: 0.5456
Epoch 5/30
107/107 [==============================] - 16s 147ms/step
 — val_f1: 0.628164 — val_precision: 0.642008 — val_recall: 0.624153

Epoch 5: val_accuracy improved from 0.52047 to 0.62415, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 84s 3s/step - loss: 1.2737 - accuracy: 0.5839 - val_loss: 1.3646 - val_accuracy: 0.6242 - val_f1: 0.6282 - val_recall: 0.6242 - val_precision: 0.6420
Epoch 6/30
107/107 [==============================] - 15s 139ms/step
 — val_f1: 0.685028 — val_precision: 0.691521 — val_recall: 0.683652

Epoch 6: val_accuracy improved from 0.62415 to 0.68365, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 83s 3s/step - loss: 0.8138 - accuracy: 0.7429 - val_loss: 1.1135 - val_accuracy: 0.6837 - val_f1: 0.6850 - val_recall: 0.6837 - val_precision: 0.6915
Epoch 7/30
107/107 [==============================] - 15s 138ms/step
 — val_f1: 0.709699 — val_precision: 0.717580 — val_recall: 0.708395

Epoch 7: val_accuracy improved from 0.68365 to 0.70839, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 82s 3s/step - loss: 0.5517 - accuracy: 0.8327 - val_loss: 1.0663 - val_accuracy: 0.7084 - val_f1: 0.7097 - val_recall: 0.7084 - val_precision: 0.7176
Epoch 8/30
107/107 [==============================] - 16s 147ms/step
 — val_f1: 0.735699 — val_precision: 0.744867 — val_recall: 0.734610

Epoch 8: val_accuracy improved from 0.70839 to 0.73461, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 78s 3s/step - loss: 0.4017 - accuracy: 0.8759 - val_loss: 0.9914 - val_accuracy: 0.7346 - val_f1: 0.7357 - val_recall: 0.7346 - val_precision: 0.7449
Epoch 9/30
107/107 [==============================] - 15s 141ms/step
 — val_f1: 0.738030 — val_precision: 0.744813 — val_recall: 0.737555

Epoch 9: val_accuracy improved from 0.73461 to 0.73756, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 77s 3s/step - loss: 0.2943 - accuracy: 0.9156 - val_loss: 1.0110 - val_accuracy: 0.7376 - val_f1: 0.7380 - val_recall: 0.7376 - val_precision: 0.7448
Epoch 10/30
107/107 [==============================] - 15s 139ms/step
 — val_f1: 0.744511 — val_precision: 0.751991 — val_recall: 0.744330

Epoch 10: val_accuracy improved from 0.73756 to 0.74433, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 83s 3s/step - loss: 0.2451 - accuracy: 0.9309 - val_loss: 1.0448 - val_accuracy: 0.7443 - val_f1: 0.7445 - val_recall: 0.7443 - val_precision: 0.7520
Epoch 11/30
107/107 [==============================] - 15s 142ms/step
 — val_f1: 0.756774 — val_precision: 0.762311 — val_recall: 0.756996

Epoch 11: val_accuracy improved from 0.74433 to 0.75700, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 81s 3s/step - loss: 0.1945 - accuracy: 0.9403 - val_loss: 1.0450 - val_accuracy: 0.7570 - val_f1: 0.7568 - val_recall: 0.7570 - val_precision: 0.7623
Epoch 12/30
107/107 [==============================] - 15s 140ms/step
 — val_f1: 0.750578 — val_precision: 0.756629 — val_recall: 0.750515

Epoch 12: val_accuracy did not improve from 0.75700
31/31 [==============================] - 78s 3s/step - loss: 0.1742 - accuracy: 0.9499 - val_loss: 1.0930 - val_accuracy: 0.7505 - val_f1: 0.7506 - val_recall: 0.7505 - val_precision: 0.7566
Epoch 13/30
107/107 [==============================] - 16s 147ms/step
 — val_f1: 0.749980 — val_precision: 0.756777 — val_recall: 0.749926

Epoch 13: val_accuracy did not improve from 0.75700
31/31 [==============================] - 82s 3s/step - loss: 0.1530 - accuracy: 0.9544 - val_loss: 1.1329 - val_accuracy: 0.7499 - val_f1: 0.7500 - val_recall: 0.7499 - val_precision: 0.7568
Epoch 14/30
107/107 [==============================] - 15s 139ms/step
 — val_f1: 0.756778 — val_precision: 0.763451 — val_recall: 0.756701

Epoch 14: val_accuracy did not improve from 0.75700
31/31 [==============================] - 82s 3s/step - loss: 0.1350 - accuracy: 0.9615 - val_loss: 1.1292 - val_accuracy: 0.7567 - val_f1: 0.7568 - val_recall: 0.7567 - val_precision: 0.7635
Epoch 15/30
107/107 [==============================] - 15s 138ms/step
 — val_f1: 0.746010 — val_precision: 0.765309 — val_recall: 0.744919

Epoch 15: val_accuracy did not improve from 0.75700
31/31 [==============================] - 81s 3s/step - loss: 0.1330 - accuracy: 0.9620 - val_loss: 1.2302 - val_accuracy: 0.7449 - val_f1: 0.7460 - val_recall: 0.7449 - val_precision: 0.7653
Epoch 16/30
107/107 [==============================] - 15s 142ms/step
 — val_f1: 0.768306 — val_precision: 0.772954 — val_recall: 0.768483

Epoch 16: val_accuracy improved from 0.75700 to 0.76848, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 82s 3s/step - loss: 0.1152 - accuracy: 0.9670 - val_loss: 1.1630 - val_accuracy: 0.7685 - val_f1: 0.7683 - val_recall: 0.7685 - val_precision: 0.7730
Epoch 17/30
107/107 [==============================] - 15s 140ms/step
 — val_f1: 0.768201 — val_precision: 0.776478 — val_recall: 0.767894

Epoch 17: val_accuracy did not improve from 0.76848
31/31 [==============================] - 82s 3s/step - loss: 0.0932 - accuracy: 0.9741 - val_loss: 1.2059 - val_accuracy: 0.7679 - val_f1: 0.7682 - val_recall: 0.7679 - val_precision: 0.7765
Epoch 18/30
107/107 [==============================] - 15s 139ms/step
 — val_f1: 0.766647 — val_precision: 0.775616 — val_recall: 0.765538

Epoch 18: val_accuracy did not improve from 0.76848
31/31 [==============================] - 82s 3s/step - loss: 0.0916 - accuracy: 0.9737 - val_loss: 1.2364 - val_accuracy: 0.7655 - val_f1: 0.7666 - val_recall: 0.7655 - val_precision: 0.7756
Epoch 19/30
107/107 [==============================] - 15s 144ms/step
 — val_f1: 0.765953 — val_precision: 0.773043 — val_recall: 0.766127

Epoch 19: val_accuracy did not improve from 0.76848
31/31 [==============================] - 81s 3s/step - loss: 0.0811 - accuracy: 0.9779 - val_loss: 1.2450 - val_accuracy: 0.7661 - val_f1: 0.7660 - val_recall: 0.7661 - val_precision: 0.7730
Epoch 20/30
107/107 [==============================] - 15s 138ms/step
 — val_f1: 0.775256 — val_precision: 0.778580 — val_recall: 0.775258

Epoch 20: val_accuracy improved from 0.76848 to 0.77526, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 75s 2s/step - loss: 0.0821 - accuracy: 0.9768 - val_loss: 1.2009 - val_accuracy: 0.7753 - val_f1: 0.7753 - val_recall: 0.7753 - val_precision: 0.7786
Epoch 21/30
107/107 [==============================] - 15s 140ms/step
 — val_f1: 0.772409 — val_precision: 0.777390 — val_recall: 0.772312

Epoch 21: val_accuracy did not improve from 0.77526
31/31 [==============================] - 75s 2s/step - loss: 0.0812 - accuracy: 0.9764 - val_loss: 1.2900 - val_accuracy: 0.7723 - val_f1: 0.7724 - val_recall: 0.7723 - val_precision: 0.7774
Epoch 22/30
107/107 [==============================] - 15s 137ms/step
 — val_f1: 0.778782 — val_precision: 0.782813 — val_recall: 0.778792

Epoch 22: val_accuracy improved from 0.77526 to 0.77879, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 73s 2s/step - loss: 0.0648 - accuracy: 0.9821 - val_loss: 1.2483 - val_accuracy: 0.7788 - val_f1: 0.7788 - val_recall: 0.7788 - val_precision: 0.7828
Epoch 23/30
107/107 [==============================] - 15s 138ms/step
 — val_f1: 0.782058 — val_precision: 0.790398 — val_recall: 0.781149

Epoch 23: val_accuracy improved from 0.77879 to 0.78115, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 76s 2s/step - loss: 0.0627 - accuracy: 0.9802 - val_loss: 1.3588 - val_accuracy: 0.7811 - val_f1: 0.7821 - val_recall: 0.7811 - val_precision: 0.7904
Epoch 24/30
107/107 [==============================] - 15s 137ms/step
 — val_f1: 0.773286 — val_precision: 0.780003 — val_recall: 0.772018

Epoch 24: val_accuracy did not improve from 0.78115
31/31 [==============================] - 81s 3s/step - loss: 0.0627 - accuracy: 0.9817 - val_loss: 1.3648 - val_accuracy: 0.7720 - val_f1: 0.7733 - val_recall: 0.7720 - val_precision: 0.7800
Epoch 25/30
107/107 [==============================] - 15s 138ms/step
 — val_f1: 0.777916 — val_precision: 0.785783 — val_recall: 0.777320

Epoch 25: val_accuracy did not improve from 0.78115
31/31 [==============================] - 81s 3s/step - loss: 0.0576 - accuracy: 0.9835 - val_loss: 1.3201 - val_accuracy: 0.7773 - val_f1: 0.7779 - val_recall: 0.7773 - val_precision: 0.7858
Epoch 26/30
107/107 [==============================] - 15s 141ms/step
 — val_f1: 0.775969 — val_precision: 0.782348 — val_recall: 0.775258

Epoch 26: val_accuracy did not improve from 0.78115
31/31 [==============================] - 81s 3s/step - loss: 0.0562 - accuracy: 0.9846 - val_loss: 1.3614 - val_accuracy: 0.7753 - val_f1: 0.7760 - val_recall: 0.7753 - val_precision: 0.7823
Epoch 27/30
107/107 [==============================] - 15s 137ms/step
 — val_f1: 0.769021 — val_precision: 0.779069 — val_recall: 0.768778

Epoch 27: val_accuracy did not improve from 0.78115
31/31 [==============================] - 80s 3s/step - loss: 0.0591 - accuracy: 0.9818 - val_loss: 1.4699 - val_accuracy: 0.7688 - val_f1: 0.7690 - val_recall: 0.7688 - val_precision: 0.7791
Epoch 28/30
107/107 [==============================] - 15s 144ms/step
 — val_f1: 0.776310 — val_precision: 0.781976 — val_recall: 0.776730

Epoch 28: val_accuracy did not improve from 0.78115
31/31 [==============================] - 76s 2s/step - loss: 0.0524 - accuracy: 0.9854 - val_loss: 1.4153 - val_accuracy: 0.7767 - val_f1: 0.7763 - val_recall: 0.7767 - val_precision: 0.7820
Epoch 29/30
107/107 [==============================] - 15s 143ms/step
 — val_f1: 0.784216 — val_precision: 0.791574 — val_recall: 0.784094

Epoch 29: val_accuracy improved from 0.78115 to 0.78409, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 80s 3s/step - loss: 0.0473 - accuracy: 0.9852 - val_loss: 1.4511 - val_accuracy: 0.7841 - val_f1: 0.7842 - val_recall: 0.7841 - val_precision: 0.7916
Epoch 30/30
107/107 [==============================] - 15s 136ms/step
 — val_f1: 0.787878 — val_precision: 0.793807 — val_recall: 0.787629

Epoch 30: val_accuracy improved from 0.78409 to 0.78763, saving model to /content/checkpoints/BiGRUMLP.hdf5
31/31 [==============================] - 81s 3s/step - loss: 0.0460 - accuracy: 0.9874 - val_loss: 1.4352 - val_accuracy: 0.7876 - val_f1: 0.7879 - val_recall: 0.7876 - val_precision: 0.7938

Visualize Model's Training History¶

In [12]:
%matplotlib inline
import matplotlib.pyplot as plt

# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper left')
plt.show()

# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper right')
plt.show()

Evaluate performance of BiGRU + MLP model on dev data¶

In [15]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, GRU
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report


GRU_SIZE = 64
DENSE = 32

with tf.device('/device:GPU:0'):

  model = Sequential()
  model.add(Embedding(input_dim=MAX_WORDS+2, output_dim=EMBEDDING_DIM, # weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,mask_zero=True, trainable=True)) #trainable=False))
  model.add(Bidirectional(GRU(GRU_SIZE, return_sequences=False, recurrent_dropout = 0.33)))
  model.add(Dense(DENSE, activation='relu' ))
  model.add(Dense(len(twenty_train.target_names), activation='softmax'))

  # Load weights from the pre-trained model
  model.load_weights("/content/checkpoints/BiGRUMLP.hdf5")
  # model.load_weights("/content/gdrive/My Drive/checkpoints/BiGRUMLP.hdf5")

  predictions = np.argmax(model.predict(val_data), -1)
  print(classification_report(y_val, predictions, target_names=twenty_train.target_names))
WARNING:tensorflow:Layer gru_5 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer gru_5 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer gru_5 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
107/107 [==============================] - 23s 203ms/step
                          precision    recall  f1-score   support

             alt.atheism       0.88      0.83      0.85       160
           comp.graphics       0.73      0.68      0.71       165
 comp.os.ms-windows.misc       0.73      0.78      0.76       189
comp.sys.ibm.pc.hardware       0.70      0.63      0.66       168
   comp.sys.mac.hardware       0.61      0.80      0.69       182
          comp.windows.x       0.80      0.66      0.73       168
            misc.forsale       0.71      0.65      0.68       182
               rec.autos       0.83      0.74      0.78       181
         rec.motorcycles       0.82      0.81      0.81       184
      rec.sport.baseball       0.92      0.79      0.85       169
        rec.sport.hockey       0.80      0.90      0.85       175
               sci.crypt       0.91      0.93      0.92       177
         sci.electronics       0.84      0.72      0.78       173
                 sci.med       0.82      0.86      0.84       181
               sci.space       0.76      0.90      0.82       181
  soc.religion.christian       0.80      0.78      0.79       177
      talk.politics.guns       0.83      0.85      0.84       177
   talk.politics.mideast       0.96      0.85      0.90       170
      talk.politics.misc       0.74      0.84      0.79       135
      talk.religion.misc       0.64      0.74      0.69       101

                accuracy                           0.79      3395
               macro avg       0.79      0.79      0.79      3395
            weighted avg       0.79      0.79      0.79      3395

In [17]:
from sklearn.metrics import accuracy_score
predictions = np.argmax(model.predict(val_data), -1)
print(f'Validation Accuracy: {accuracy_score(y_val, predictions)*100:.2f}%')

predictions = np.argmax(model.predict(test_data), -1)
print(f'Test Accuracy:{accuracy_score(y_test, predictions)*100:.2f}%')
107/107 [==============================] - 16s 144ms/step
Validation Accuracy: 78.76%
32/32 [==============================] - 4s 132ms/step
Test Accuracy:67.70%

Custom keras layer for linear and deep self-attention over RNNs output states¶

In [18]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.layers import Layer
import numpy as np


def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
      return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
      return K.dot(x, kernel)


class LinearAttention(Layer):
    def __init__(self,
                 kernel_regularizer=None, bias_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        #apply penalties on layer parameters or layer activity during optimization.
        #These penalties are summed into the loss function that the network optimizes.
        self.W_regularizer = regularizers.get(kernel_regularizer)
        self.b_regularizer = regularizers.get(bias_regularizer)

        #setting constraints (eg. non-negativity) on model parameters during training.
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.return_attention = return_attention
        super(LinearAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        #input_shape[-1] = 600
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(1,),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, inputs, mask=None):
        # do not pass the mask to the next layers
        if self.return_attention:
            return [None, None]
        return None

    def call(self, x, mask=None):

        # eij = Wx + b
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        # Apply mask
        if mask is not None:
            eij *= K.cast(mask, K.floatx())

        # a = softmax(eij)
        # shape_of_a = 1000 (sequence length/time steps)
        a = K.expand_dims(K.softmax(eij, axis=-1))
        # position wise multiplication -> shape = 600x1000
        weighted_input = x * a
        # shape -> 600
        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            #input_shape[-1] -> 600
            #input_shape[1]-> 1000
            #input_shape[0] -> batch_size/number of samples
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]


class DeepAttention(Layer):
    def __init__(self,
                 kernel_regularizer=None, u_regularizer=None, bias_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(kernel_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b1_regularizer = regularizers.get(bias_regularizer)
        self.b2_regularizer = regularizers.get(bias_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b1_constraint = constraints.get(b_constraint)
        self.b2_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.return_attention = return_attention
        super(DeepAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b1 = self.add_weight(shape=(input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b1'.format(self.name),
                                     regularizer=self.b1_regularizer,
                                     constraint=self.b1_constraint)
            self.b2 = self.add_weight(shape=(1,),
                                     initializer='zero',
                                     name='{}_b2'.format(self.name),
                                     regularizer=self.b2_regularizer,
                                     constraint=self.b2_constraint)
        else:
            self.b1 = None
            self.b2 = None

        self.u = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        self.built = True


    def compute_mask(self, inputs, mask=None):
        # do not pass the mask to the next layers
        if self.return_attention:
            return [None, None]
        return None

    def call(self, x, mask=None):
        # uit = tanh(Wx + b)
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b1

        uit = K.tanh(uit)

        # ait = softmax(Ueij)
        eij = dot_product(uit, self.u)
        if self.bias:
            eij += self.b2

        # Apply mask
        if mask is not None:
            eij *= K.cast(mask, K.floatx())

        a = K.expand_dims(K.softmax(eij, axis=-1))

        weighted_input = x * a
        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

Create and train a BiLSTM + deep self-attention + MLP model¶

In [20]:
!pip install keras-self-attention
Collecting keras-self-attention
  Downloading keras-self-attention-0.51.0.tar.gz (11 kB)
  Preparing metadata (setup.py) ... done
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from keras-self-attention) (1.23.5)
Building wheels for collected packages: keras-self-attention
  Building wheel for keras-self-attention (setup.py) ... done
  Created wheel for keras-self-attention: filename=keras_self_attention-0.51.0-py3-none-any.whl size=18895 sha256=b05f34a5aa9893dda77dfe7899801c2b2281880eff229aac13edd33dd0722c36
  Stored in directory: /root/.cache/pip/wheels/b8/f7/24/607b483144fb9c47b4ba2c5fba6b68e54aeee2d5bf6c05302e
Successfully built keras-self-attention
Installing collected packages: keras-self-attention
Successfully installed keras-self-attention-0.51.0
In [21]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, LSTM, Embedding, Input
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
from tensorflow.keras import Model

from keras_self_attention import SeqSelfAttention

LSTM_SIZE = 300
DENSE = 1000

with tf.device('/device:GPU:0'):

  inputs = Input((MAX_SEQUENCE_LENGTH,))

  # Define the Embedding Layer with fastext weights
  embeddings = Embedding(input_dim=MAX_WORDS+2, output_dim=EMBEDDING_DIM, #weights=[embedding_matrix],
                         input_length=MAX_SEQUENCE_LENGTH, mask_zero=True, trainable=True)(inputs) #trainable=False)(inputs)
  drop_emb = Dropout(0.33)(embeddings)

  # Define a (Biderectional) RNN with LSTM cells
  bilstm = Bidirectional(LSTM(units=LSTM_SIZE, return_sequences=True, recurrent_dropout=0.33))(drop_emb)
  drop_encodings = Dropout(0.33)(bilstm)

  # Pass the encoding through an Attension Layer
  x, attn = DeepAttention(return_attention=True)(drop_encodings)
  # x, attn = LinearAttention(return_attention=True)(drop_encodings)


  # Alternatively use keras package for self-attention
  #x, attn = SeqSelfAttention(return_attention=True)(drop_encodings)

  # Apply Droupout to the encoding produced by the attension mechanism
  drop_x = Dropout(0.33)(x)

  # Pass thruogh a Dense Layer
  hidden = Dense(units=DENSE, activation="relu")(drop_x)

  # Apply Dropout to the output of the Dense Layer
  drop_out = Dropout(0.33)(hidden)

  # Last pass through a Dense Layer with softmax activation to produce a probability distribution
  out = Dense(units=len(twenty_train.target_names), activation="softmax")(drop_out)

  # Wrap model --> Remember Functional API
  model2 = Model(inputs=inputs, outputs=out)
  print(model2.summary())

  model2.compile(loss='categorical_crossentropy',
                 optimizer=Adam(learning_rate=0.001),
                 metrics=["accuracy"])

  if not os.path.exists('/content/checkpoints'):
    os.makedirs('/content/checkpoints')

  checkpoint = ModelCheckpoint('/content/checkpoints/BiLSTM_attn.hdf5',
                               monitor='val_accuracy',
                               mode='max', verbose=2,
                               save_best_only=True,
                               save_weights_only=True)

  history2 = model2.fit(train_data, y_train_1_hot,
                        validation_data=(val_data, y_val_1_hot),
                        batch_size=128,
                        epochs=30,
                        shuffle=True,
                        callbacks=[Metrics(valid_data=(val_data, y_val_1_hot)),
                        checkpoint])
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
/usr/local/lib/python3.10/dist-packages/keras/src/initializers/initializers.py:120: UserWarning: The initializer GlorotUniform is unseeded and being called multiple times, which will return identical values each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initializer instance more than once.
  warnings.warn(
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, 250)]             0         
                                                                 
 embedding_6 (Embedding)     (None, 250, 100)          10000200  
                                                                 
 dropout_6 (Dropout)         (None, 250, 100)          0         
                                                                 
 bidirectional_6 (Bidirecti  (None, 250, 600)          962400    
 onal)                                                           
                                                                 
 dropout_7 (Dropout)         (None, 250, 600)          0         
                                                                 
 deep_attention (DeepAttent  [(None, 600),             361201    
 ion)                         (None, 250, 1)]                    
                                                                 
 dropout_8 (Dropout)         (None, 600)               0         
                                                                 
 dense_12 (Dense)            (None, 1000)              601000    
                                                                 
 dropout_9 (Dropout)         (None, 1000)              0         
                                                                 
 dense_13 (Dense)            (None, 20)                20020     
                                                                 
=================================================================
Total params: 11944821 (45.57 MB)
Trainable params: 11944821 (45.57 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
WARNING:absl:`lr` is deprecated in Keras optimizer, please use `learning_rate` or use the legacy optimizer, e.g.,tf.keras.optimizers.legacy.Adam.
None
Epoch 1/30
107/107 [==============================] - 20s 183ms/step
 — val_f1: 0.122530 — val_precision: 0.150319 — val_recall: 0.209720

Epoch 1: val_accuracy improved from -inf to 0.20972, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 239s 3s/step - loss: 2.7477 - accuracy: 0.1183 - val_loss: 2.1294 - val_accuracy: 0.2097 - val_f1: 0.1225 - val_recall: 0.2097 - val_precision: 0.1503
Epoch 2/30
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
107/107 [==============================] - 19s 176ms/step
 — val_f1: 0.495426 — val_precision: 0.514294 — val_recall: 0.527246

Epoch 2: val_accuracy improved from 0.20972 to 0.52725, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 172s 3s/step - loss: 1.6186 - accuracy: 0.4012 - val_loss: 1.3446 - val_accuracy: 0.5272 - val_f1: 0.4954 - val_recall: 0.5272 - val_precision: 0.5143
Epoch 3/30
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
107/107 [==============================] - 17s 162ms/step
 — val_f1: 0.711097 — val_precision: 0.737267 — val_recall: 0.710457

Epoch 3: val_accuracy improved from 0.52725 to 0.71046, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 175s 3s/step - loss: 0.7983 - accuracy: 0.7082 - val_loss: 0.9146 - val_accuracy: 0.7105 - val_f1: 0.7111 - val_recall: 0.7105 - val_precision: 0.7373
Epoch 4/30
107/107 [==============================] - 18s 164ms/step
 — val_f1: 0.778107 — val_precision: 0.788423 — val_recall: 0.776436

Epoch 4: val_accuracy improved from 0.71046 to 0.77644, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 165s 3s/step - loss: 0.3040 - accuracy: 0.9063 - val_loss: 0.7610 - val_accuracy: 0.7764 - val_f1: 0.7781 - val_recall: 0.7764 - val_precision: 0.7884
Epoch 5/30
107/107 [==============================] - 18s 171ms/step
 — val_f1: 0.776559 — val_precision: 0.791935 — val_recall: 0.773196

Epoch 5: val_accuracy did not improve from 0.77644
62/62 [==============================] - 165s 3s/step - loss: 0.1214 - accuracy: 0.9646 - val_loss: 0.9098 - val_accuracy: 0.7732 - val_f1: 0.7766 - val_recall: 0.7732 - val_precision: 0.7919
Epoch 6/30
107/107 [==============================] - 18s 169ms/step
 — val_f1: 0.777996 — val_precision: 0.796160 — val_recall: 0.773196

Epoch 6: val_accuracy did not improve from 0.77644
62/62 [==============================] - 164s 3s/step - loss: 0.0703 - accuracy: 0.9821 - val_loss: 1.0266 - val_accuracy: 0.7732 - val_f1: 0.7780 - val_recall: 0.7732 - val_precision: 0.7962
Epoch 7/30
107/107 [==============================] - 17s 163ms/step
 — val_f1: 0.803463 — val_precision: 0.816127 — val_recall: 0.799705

Epoch 7: val_accuracy improved from 0.77644 to 0.79971, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 160s 3s/step - loss: 0.0387 - accuracy: 0.9904 - val_loss: 0.9099 - val_accuracy: 0.7997 - val_f1: 0.8035 - val_recall: 0.7997 - val_precision: 0.8161
Epoch 8/30
107/107 [==============================] - 18s 164ms/step
 — val_f1: 0.810377 — val_precision: 0.823782 — val_recall: 0.807069

Epoch 8: val_accuracy improved from 0.79971 to 0.80707, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 167s 3s/step - loss: 0.0354 - accuracy: 0.9925 - val_loss: 0.8670 - val_accuracy: 0.8071 - val_f1: 0.8104 - val_recall: 0.8071 - val_precision: 0.8238
Epoch 9/30
107/107 [==============================] - 17s 162ms/step
 — val_f1: 0.801430 — val_precision: 0.814549 — val_recall: 0.797644

Epoch 9: val_accuracy did not improve from 0.80707
62/62 [==============================] - 157s 3s/step - loss: 0.0167 - accuracy: 0.9960 - val_loss: 0.9553 - val_accuracy: 0.7976 - val_f1: 0.8014 - val_recall: 0.7976 - val_precision: 0.8145
Epoch 10/30
107/107 [==============================] - 19s 175ms/step
 — val_f1: 0.810513 — val_precision: 0.819751 — val_recall: 0.808542

Epoch 10: val_accuracy improved from 0.80707 to 0.80854, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 160s 3s/step - loss: 0.0136 - accuracy: 0.9967 - val_loss: 0.9660 - val_accuracy: 0.8085 - val_f1: 0.8105 - val_recall: 0.8085 - val_precision: 0.8198
Epoch 11/30
107/107 [==============================] - 18s 164ms/step
 — val_f1: 0.815993 — val_precision: 0.826887 — val_recall: 0.812666

Epoch 11: val_accuracy improved from 0.80854 to 0.81267, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 158s 3s/step - loss: 0.0132 - accuracy: 0.9973 - val_loss: 0.9424 - val_accuracy: 0.8127 - val_f1: 0.8160 - val_recall: 0.8127 - val_precision: 0.8269
Epoch 12/30
107/107 [==============================] - 18s 164ms/step
 — val_f1: 0.809659 — val_precision: 0.822246 — val_recall: 0.807069

Epoch 12: val_accuracy did not improve from 0.81267
62/62 [==============================] - 160s 3s/step - loss: 0.0179 - accuracy: 0.9960 - val_loss: 0.9250 - val_accuracy: 0.8071 - val_f1: 0.8097 - val_recall: 0.8071 - val_precision: 0.8222
Epoch 13/30
107/107 [==============================] - 19s 174ms/step
 — val_f1: 0.806898 — val_precision: 0.817460 — val_recall: 0.803829

Epoch 13: val_accuracy did not improve from 0.81267
62/62 [==============================] - 161s 3s/step - loss: 0.0166 - accuracy: 0.9962 - val_loss: 0.9572 - val_accuracy: 0.8038 - val_f1: 0.8069 - val_recall: 0.8038 - val_precision: 0.8175
Epoch 14/30
107/107 [==============================] - 18s 164ms/step
 — val_f1: 0.811613 — val_precision: 0.823340 — val_recall: 0.808542

Epoch 14: val_accuracy did not improve from 0.81267
62/62 [==============================] - 160s 3s/step - loss: 0.0154 - accuracy: 0.9967 - val_loss: 0.8783 - val_accuracy: 0.8085 - val_f1: 0.8116 - val_recall: 0.8085 - val_precision: 0.8233
Epoch 15/30
107/107 [==============================] - 17s 162ms/step
 — val_f1: 0.807784 — val_precision: 0.819169 — val_recall: 0.805891

Epoch 15: val_accuracy did not improve from 0.81267
62/62 [==============================] - 162s 3s/step - loss: 0.0140 - accuracy: 0.9973 - val_loss: 0.9599 - val_accuracy: 0.8059 - val_f1: 0.8078 - val_recall: 0.8059 - val_precision: 0.8192
Epoch 16/30
107/107 [==============================] - 19s 174ms/step
 — val_f1: 0.807657 — val_precision: 0.820539 — val_recall: 0.805891

Epoch 16: val_accuracy did not improve from 0.81267
62/62 [==============================] - 159s 3s/step - loss: 0.0147 - accuracy: 0.9962 - val_loss: 0.9441 - val_accuracy: 0.8059 - val_f1: 0.8077 - val_recall: 0.8059 - val_precision: 0.8205
Epoch 17/30
107/107 [==============================] - 18s 164ms/step
 — val_f1: 0.821950 — val_precision: 0.828624 — val_recall: 0.819735

Epoch 17: val_accuracy improved from 0.81267 to 0.81973, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 158s 3s/step - loss: 0.0122 - accuracy: 0.9973 - val_loss: 0.8409 - val_accuracy: 0.8197 - val_f1: 0.8219 - val_recall: 0.8197 - val_precision: 0.8286
Epoch 18/30
107/107 [==============================] - 18s 168ms/step
 — val_f1: 0.811753 — val_precision: 0.823568 — val_recall: 0.809131

Epoch 18: val_accuracy did not improve from 0.81973
62/62 [==============================] - 162s 3s/step - loss: 0.0083 - accuracy: 0.9981 - val_loss: 1.0236 - val_accuracy: 0.8091 - val_f1: 0.8118 - val_recall: 0.8091 - val_precision: 0.8236
Epoch 19/30
107/107 [==============================] - 19s 174ms/step
 — val_f1: 0.820830 — val_precision: 0.832321 — val_recall: 0.817673

Epoch 19: val_accuracy did not improve from 0.81973
62/62 [==============================] - 162s 3s/step - loss: 0.0116 - accuracy: 0.9967 - val_loss: 0.9014 - val_accuracy: 0.8177 - val_f1: 0.8208 - val_recall: 0.8177 - val_precision: 0.8323
Epoch 20/30
107/107 [==============================] - 18s 164ms/step
 — val_f1: 0.812182 — val_precision: 0.821446 — val_recall: 0.810898

Epoch 20: val_accuracy did not improve from 0.81973
62/62 [==============================] - 160s 3s/step - loss: 0.0119 - accuracy: 0.9966 - val_loss: 0.9161 - val_accuracy: 0.8109 - val_f1: 0.8122 - val_recall: 0.8109 - val_precision: 0.8214
Epoch 21/30
107/107 [==============================] - 17s 163ms/step
 — val_f1: 0.822124 — val_precision: 0.833710 — val_recall: 0.819146

Epoch 21: val_accuracy did not improve from 0.81973
62/62 [==============================] - 163s 3s/step - loss: 0.0127 - accuracy: 0.9973 - val_loss: 0.9170 - val_accuracy: 0.8191 - val_f1: 0.8221 - val_recall: 0.8191 - val_precision: 0.8337
Epoch 22/30
107/107 [==============================] - 19s 174ms/step
 — val_f1: 0.818140 — val_precision: 0.827724 — val_recall: 0.816200

Epoch 22: val_accuracy did not improve from 0.81973
62/62 [==============================] - 160s 3s/step - loss: 0.0096 - accuracy: 0.9975 - val_loss: 0.9062 - val_accuracy: 0.8162 - val_f1: 0.8181 - val_recall: 0.8162 - val_precision: 0.8277
Epoch 23/30
107/107 [==============================] - 18s 165ms/step
 — val_f1: 0.795488 — val_precision: 0.808043 — val_recall: 0.793520

Epoch 23: val_accuracy did not improve from 0.81973
62/62 [==============================] - 160s 3s/step - loss: 0.0158 - accuracy: 0.9960 - val_loss: 1.0718 - val_accuracy: 0.7935 - val_f1: 0.7955 - val_recall: 0.7935 - val_precision: 0.8080
Epoch 24/30
107/107 [==============================] - 17s 163ms/step
 — val_f1: 0.822440 — val_precision: 0.832766 — val_recall: 0.820029

Epoch 24: val_accuracy improved from 0.81973 to 0.82003, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 162s 3s/step - loss: 0.0147 - accuracy: 0.9967 - val_loss: 0.8964 - val_accuracy: 0.8200 - val_f1: 0.8224 - val_recall: 0.8200 - val_precision: 0.8328
Epoch 25/30
107/107 [==============================] - 18s 167ms/step
 — val_f1: 0.827384 — val_precision: 0.838000 — val_recall: 0.824742

Epoch 25: val_accuracy improved from 0.82003 to 0.82474, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 157s 3s/step - loss: 0.0114 - accuracy: 0.9976 - val_loss: 0.8981 - val_accuracy: 0.8247 - val_f1: 0.8274 - val_recall: 0.8247 - val_precision: 0.8380
Epoch 26/30
107/107 [==============================] - 18s 167ms/step
 — val_f1: 0.836116 — val_precision: 0.844064 — val_recall: 0.834168

Epoch 26: val_accuracy improved from 0.82474 to 0.83417, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 162s 3s/step - loss: 0.0073 - accuracy: 0.9985 - val_loss: 0.8789 - val_accuracy: 0.8342 - val_f1: 0.8361 - val_recall: 0.8342 - val_precision: 0.8441
Epoch 27/30
107/107 [==============================] - 18s 165ms/step
 — val_f1: 0.837743 — val_precision: 0.843811 — val_recall: 0.836230

Epoch 27: val_accuracy improved from 0.83417 to 0.83623, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 160s 3s/step - loss: 0.0060 - accuracy: 0.9989 - val_loss: 0.8408 - val_accuracy: 0.8362 - val_f1: 0.8377 - val_recall: 0.8362 - val_precision: 0.8438
Epoch 28/30
107/107 [==============================] - 18s 167ms/step
 — val_f1: 0.831491 — val_precision: 0.843039 — val_recall: 0.828277

Epoch 28: val_accuracy did not improve from 0.83623
62/62 [==============================] - 157s 3s/step - loss: 0.0021 - accuracy: 0.9996 - val_loss: 0.9481 - val_accuracy: 0.8283 - val_f1: 0.8315 - val_recall: 0.8283 - val_precision: 0.8430
Epoch 29/30
107/107 [==============================] - 18s 169ms/step
 — val_f1: 0.838640 — val_precision: 0.842258 — val_recall: 0.837408

Epoch 29: val_accuracy improved from 0.83623 to 0.83741, saving model to /content/checkpoints/BiLSTM_attn.hdf5
62/62 [==============================] - 159s 3s/step - loss: 0.0045 - accuracy: 0.9991 - val_loss: 0.8874 - val_accuracy: 0.8374 - val_f1: 0.8386 - val_recall: 0.8374 - val_precision: 0.8423
Epoch 30/30
107/107 [==============================] - 18s 168ms/step
 — val_f1: 0.826925 — val_precision: 0.834585 — val_recall: 0.825037

Epoch 30: val_accuracy did not improve from 0.83741
62/62 [==============================] - 158s 3s/step - loss: 0.0033 - accuracy: 0.9992 - val_loss: 0.9261 - val_accuracy: 0.8250 - val_f1: 0.8269 - val_recall: 0.8250 - val_precision: 0.8346

Evaluate performance of BiLSTM + deep self-attention + MLP model on dev data¶

In [24]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, LSTM, Embedding, Input
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
from tensorflow.keras import Model
from sklearn.metrics import classification_report

LSTM_SIZE = 300
DENSE = 1000

with tf.device('/device:GPU:0'):

  inputs = Input((MAX_SEQUENCE_LENGTH,))
  embeddings = Embedding(MAX_WORDS+2,EMBEDDING_DIM,# weights=[embedding_matrix],
                      input_length=MAX_SEQUENCE_LENGTH, mask_zero=True, trainable=True)(inputs)  #trainable=False)(inputs)
  bilstm = Bidirectional(LSTM(units=LSTM_SIZE, return_sequences=True,recurrent_dropout = 0.33))(embeddings)
  x, attn = DeepAttention(return_attention=True)(bilstm)
  hidden = Dense(units=DENSE, activation="relu")(x)
  out = Dense(units=len(twenty_train.target_names), activation="sigmoid")(hidden)
  model2 = Model(inputs, out)

  # Load weights from the pre-trained model
  model2.load_weights("/content/checkpoints/BiLSTM_attn.hdf5")

  print("Dev set performance")
  predictions_val = np.argmax(model2.predict(val_data), -1)
  print(classification_report(y_val, predictions_val, target_names=twenty_train.target_names))

  print()
  print("Training set performance") # Overfitting on training set
  predictions_train = np.argmax(model2.predict(train_data), -1)
  print(classification_report(y_train, predictions_train, target_names=twenty_train.target_names))
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
WARNING:tensorflow:Layer lstm_2 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
Dev set performance
107/107 [==============================] - 21s 191ms/step
                          precision    recall  f1-score   support

             alt.atheism       0.90      0.87      0.89       160
           comp.graphics       0.69      0.79      0.74       165
 comp.os.ms-windows.misc       0.83      0.85      0.84       189
comp.sys.ibm.pc.hardware       0.63      0.72      0.67       168
   comp.sys.mac.hardware       0.87      0.75      0.80       182
          comp.windows.x       0.94      0.88      0.90       168
            misc.forsale       0.72      0.71      0.71       182
               rec.autos       0.80      0.78      0.79       181
         rec.motorcycles       0.85      0.86      0.86       184
      rec.sport.baseball       0.93      0.91      0.92       169
        rec.sport.hockey       0.96      0.90      0.93       175
               sci.crypt       0.93      0.94      0.94       177
         sci.electronics       0.70      0.79      0.74       173
                 sci.med       0.92      0.81      0.86       181
               sci.space       0.88      0.88      0.88       181
  soc.religion.christian       0.77      0.88      0.82       177
      talk.politics.guns       0.88      0.93      0.90       177
   talk.politics.mideast       0.95      0.87      0.91       170
      talk.politics.misc       0.83      0.78      0.80       135
      talk.religion.misc       0.76      0.71      0.73       101

                accuracy                           0.83      3395
               macro avg       0.84      0.83      0.83      3395
            weighted avg       0.84      0.83      0.83      3395


Training set performance
248/248 [==============================] - 42s 170ms/step
                          precision    recall  f1-score   support

             alt.atheism       1.00      1.00      1.00       320
           comp.graphics       1.00      1.00      1.00       419
 comp.os.ms-windows.misc       1.00      1.00      1.00       402
comp.sys.ibm.pc.hardware       1.00      1.00      1.00       422
   comp.sys.mac.hardware       1.00      1.00      1.00       396
          comp.windows.x       1.00      1.00      1.00       425
            misc.forsale       1.00      1.00      1.00       403
               rec.autos       1.00      1.00      1.00       413
         rec.motorcycles       1.00      1.00      1.00       414
      rec.sport.baseball       1.00      1.00      1.00       428
        rec.sport.hockey       1.00      1.00      1.00       425
               sci.crypt       1.00      1.00      1.00       418
         sci.electronics       1.00      1.00      1.00       418
                 sci.med       1.00      1.00      1.00       413
               sci.space       1.00      1.00      1.00       412
  soc.religion.christian       0.96      1.00      0.98       422
      talk.politics.guns       1.00      1.00      1.00       369
   talk.politics.mideast       1.00      0.96      0.98       394
      talk.politics.misc       1.00      1.00      1.00       330
      talk.religion.misc       1.00      1.00      1.00       276

                accuracy                           1.00      7919
               macro avg       1.00      1.00      1.00      7919
            weighted avg       1.00      1.00      1.00      7919

In [25]:
#from sklearn.metrics import accuracy_score
predictions = np.argmax(model2.predict(val_data), -1)
print(f'Validation Accuracy: {accuracy_score(y_val, predictions)*100:.2f}%')

predictions = np.argmax(model2.predict(test_data), -1)
print(f'Test Accuracy:{accuracy_score(y_test, predictions)*100:.2f}%')
107/107 [==============================] - 18s 169ms/step
Validation Accuracy: 83.33%
32/32 [==============================] - 6s 171ms/step
Test Accuracy:71.70%

Resources¶

  • https://github.com/keras-team/keras/tree/master/examples
  • https://keras.io/