!nvidia-smi
Mon Nov 13 19:50:11 2023 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 | | N/A 64C P8 10W / 70W | 0MiB / 15360MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+
import gensim.downloader as api
word2vec = api.load('word2vec-google-news-300')
[==================================================] 100.0% 1662.8/1662.8MB downloaded
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train') #, remove=('headers', 'footers', 'quotes'))
print("Catergories")
print(twenty_train.target_names)
print("-------------")
print("First dataset's sample")
print("\n".join(twenty_train.data[0].split("\n")))
print("------------")
print("First dataset's sample category: ",twenty_train.target[0])
Catergories ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'] ------------- First dataset's sample From: lerxst@wam.umd.edu (where's my thing) Subject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: 15 I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail. Thanks, - IL ---- brought to you by your neighborhood Lerxst ---- ------------ First dataset's sample category: 7
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(twenty_train.data, twenty_train.target, test_size=0.3, random_state=12547392)
twenty_train = fetch_20newsgroups(subset='test')  # note: the variable is reused for the test split; only .data, .target and .target_names are used from here on
X_test, y_test = twenty_train.data[:1000], twenty_train.target[:1000]
print('Train samples: {}'.format(len(X_train)))
print('Val samples: {}'.format(len(X_val)))
Train samples: 7919 Val samples: 3395
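The split above is not stratified, so it is worth confirming that the 20 classes remain roughly balanced in both parts. A minimal sketch, assuming y_train and y_val from the cell above:
import numpy as np
# Per-class sample counts in the train and validation splits (20 newsgroups)
print('Train class counts:', np.bincount(y_train, minlength=20))
print('Val class counts:', np.bincount(y_val, minlength=20))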
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm
nlp = spacy.load('en_core_web_sm',disable=["tagger", "parser","ner"])
nlp.add_pipe('sentencizer')
def tokenize_samples(samples):
tokenized_samples = []
for i in tqdm(range(len(samples))):
doc = nlp(samples[i]) # Tokenize the sample into sentences
tokens = []
for sent in doc.sents:
for tok in sent: # Iterate through the words of the sentence
if '\n' in tok.text or "\t" in tok.text or "--" in tok.text or "*" in tok.text or tok.text.lower() in STOP_WORDS:
continue
if tok.text.strip():
tokens.append(tok.text.replace('"',"'").strip())
tokenized_samples.append(tokens)
return tokenized_samples
X_train_tokenized = tokenize_samples(X_train)
X_val_tokenized = tokenize_samples(X_val)
X_test_tokenized = tokenize_samples(X_test)
0%| | 0/7919 [00:00<?, ?it/s]/usr/local/lib/python3.10/dist-packages/spacy/pipeline/lemmatizer.py:211: UserWarning: [W108] The rule-based lemmatizer did not find POS annotation for one or more tokens. Check that your pipeline includes components that assign token.pos, typically 'tagger'+'attribute_ruler' or 'morphologizer'. warnings.warn(Warnings.W108) 100%|██████████| 7919/7919 [04:53<00:00, 26.99it/s] 100%|██████████| 3395/3395 [01:38<00:00, 34.49it/s] 100%|██████████| 1000/1000 [00:25<00:00, 39.01it/s]
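The W108 warning above comes from the rule-based lemmatizer, which expects POS tags from the tagger that was disabled. Since lemmas are never used here, the lemmatizer could simply be disabled as well; a sketch, assuming spaCy 3.x and en_core_web_sm:
# Also disable the lemmatizer to silence the W108 warning (lemmas are not used in tokenize_samples)
nlp = spacy.load('en_core_web_sm', disable=["tagger", "parser", "ner", "lemmatizer"])
nlp.add_pipe('sentencizer')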
import numpy as np
# Get mean and std for length on training set
print('Average length of samples: {}'.format(np.mean([len(x) for x in X_train_tokenized])))
print('Std length of samples: {}'.format(np.std([len(x) for x in X_train_tokenized])))
print('#Samples with length > 1000: {} \n'.format(np.sum([len(x) > 1000 for x in X_train_tokenized])))
print('X_example: {}'.format(X_train_tokenized[0]))
Average length of smples: 240.10670539209497 Std length of samples: 457.42933996960267 #Samples with length > 1000: 178 X_example: [':', 'kastle@wpi', '.', 'WPI.EDU', '(', 'Jacques', 'W', 'Brouillette', ')', 'Subject', ':', ':', 'WARNING', '.....', '(please', 'read', ')', '...', 'Organization', ':', 'Worcester', 'Polytechnic', 'Institute', 'Lines', ':', '8', 'Distribution', ':', 'world', 'NNTP', '-', 'Posting', '-', 'Host', ':', 'wpi.wpi.edu', 'Keywords', ':', 'BRICK', ',', 'TRUCK', ',', 'DANGER', 'plase', 'cease', 'discussion', '.', 'fail', 'people', 'feel', 'need', 'expound', 'issue', 'days', 'days', 'end', '.', 'areas', 'meant', 'type', 'discussion', '.', 'feel', 'need', 'things', ',', 'thought', '.', 'Thanks', '.', ':', 'want', 'things', 'world', ',', '58', 'Plymouth', 'small', ':', ':', 'OPEC', 'nation', 'fuel', '.', 'good', ':', ':', 'thing', '.', 'Car', 'Smashers', 'home', 'sulk', '.', ':', ':', 'Jacques', 'Brouillette', 'Manufacturing', 'Engineering', ':']
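Given the long tail (std ≈ 457), the length distribution helps justify the truncation length of 250 tokens used as MAX_SEQUENCE_LENGTH below; a small sketch, assuming X_train_tokenized from above:
lengths = np.array([len(x) for x in X_train_tokenized])
# Compare the distribution tail to the chosen cutoff of 250 tokens
print('Median length: {}'.format(int(np.median(lengths))))
print('90th percentile: {}'.format(int(np.percentile(lengths, 90))))
print('Fraction of samples truncated at 250: {:.2%}'.format(np.mean(lengths > 250)))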
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
target_list = twenty_train.target_names
y_train_1_hot = lb.fit_transform([target_list[x] for x in y_train])
y_val_1_hot = lb.transform([target_list[x] for x in y_val])
y_test_1_hot = lb.transform([target_list[x] for x in y_test])
print('Y_example: {}'.format(y_train_1_hot[0]))
Y_example: [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
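LabelBinarizer orders its columns by lb.classes_ (sorted alphabetically, which matches target_names here), so the position of the 1 can be mapped straight back to a category name; a quick sanity check, assuming lb and y_train_1_hot from the cell above:
# Map the one-hot vector of the first sample back to its category name
print('First few classes:', list(lb.classes_[:3]))
print('First sample label:', lb.classes_[np.argmax(y_train_1_hot[0])])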
import tensorflow as tf
from sklearn.metrics import f1_score, recall_score, precision_score
import numpy as np
import os
class Metrics(tf.keras.callbacks.Callback):
def __init__(self, valid_data):
super(Metrics, self).__init__()
self.validation_data = valid_data
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
val_predict = np.argmax(self.model.predict(self.validation_data[0]), -1)
val_targ = self.validation_data[1]
if len(val_targ.shape) == 2 and val_targ.shape[1] != 1:
val_targ = np.argmax(val_targ, -1)
val_targ = tf.cast(val_targ,dtype=tf.float32)
_val_f1 = f1_score(val_targ, val_predict,average="weighted")
_val_recall = recall_score(val_targ, val_predict,average="weighted")
_val_precision = precision_score(val_targ, val_predict,average="weighted")
logs['val_f1'] = _val_f1
logs['val_recall'] = _val_recall
logs['val_precision'] = _val_precision
print(" — val_f1: %f — val_precision: %f — val_recall: %f" % (_val_f1, _val_precision, _val_recall))
return
Two different options:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_SEQUENCE_LENGTH = 250
MAX_WORDS = 100000
# Init tokenizer
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='__UNK__')
# num_words: the maximum number of words to keep, based on word frequency.
# oov_token: token used to replace out-of-vocabulary (OOV) words
# Fit tokenizer (Updates internal vocabulary based on a list of texts.)
tokenizer.fit_on_texts([" ".join(x) for x in X_train_tokenized])
# Converts text to sequences of IDs
train_seqs = tokenizer.texts_to_sequences([" ".join(x) for x in X_train_tokenized])
val_seqs = tokenizer.texts_to_sequences([" ".join(x) for x in X_val_tokenized])
test_seqs = tokenizer.texts_to_sequences([" ".join(x) for x in X_test_tokenized])
train_data = pad_sequences(train_seqs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
val_data = pad_sequences(val_seqs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_data = pad_sequences(test_seqs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print(train_data[3049])
[28830 28831 11510 7 6 65 258 9 34569 1956 402 8 888 15 202 132 736 2368 190 98 70641 1894 677 4 21747 48 1357 4 43001 106 21747 13 1077 87 39 599 506 2085 7867 15404 83 533 61 44 72 21747 48 1357 4 443 3757 3756 4757 498 1462 2887 5225 5225 506 2085 7867 809 6540 70642 407 70643 70644 16057 34570 27866 70645 16057 34570 506 2085 7867 13666 34570 2804 506 2085 7867 20615 70646 15380 146 8562 506 2085 7867 56 70647 146 8441 42 34571 1087 522 2172 19315 1406 70648 18101 2741 2741 670 689 1923 567 135 5090 158 917 28830 11510 7 281 156 4190 2593 1016 135 39 59 95 258 8771 483 599 156 2772 1016 741 3654 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
word_index = tokenizer.word_index
print('Found {} unique tokens.\n'.format(len(word_index)))
print(f'We will only use {MAX_WORDS} of these')
print(list(word_index.items())[:10])
print('We will filter these out')
print(list(word_index.items())[MAX_WORDS+5:MAX_WORDS+20])
print(len(list(word_index.items())))
Found 101458 unique tokens. We will only use 100000 of these [('__UNK__', 1), ("'", 2), ("'ax", 3), ('edu', 4), ('1', 5), ('subject', 6), ('com', 7), ('lines', 8), ('organization', 9), ('2', 10)] We will filter these out [('19420', 100006), ('homestead', 100007), ('43lt', 100008), ('9974', 100009), ('hpinddh', 100010), ('ryam', 100011), ('40ppg', 100012), ('40shg', 100013), ('devorski', 100014), ('vines', 100015), ('winmain', 100016), ('wndproc', 100017), ('preload', 100018), ('53679', 100019), ('rw1031', 100020)] 101458
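To double-check the ID mapping, a padded sequence can be decoded back into tokens with the tokenizer's reverse index; a small sketch, assuming tokenizer and train_data from above (index 0 is the implicit padding value and has no entry in index_word):
# Decode the first padded training sequence back into tokens
index_word = tokenizer.index_word  # id -> token mapping built by fit_on_texts
print([index_word[i] for i in train_data[0] if i != 0][:30])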
# All the word2vec embeddings are already available in memory (loaded via gensim above)
# Now we keep only the embeddings for the words we actually need, based on the tokenizer's vocabulary
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((MAX_WORDS+2, EMBEDDING_DIM)) # +2 (pad, unknown)
# PAD is vector zero
# UNK is also vector zero
count = 0
for word, i in word_index.items():
if i > MAX_WORDS:
continue
try:
    embedding_matrix[i] = word2vec[word]
    count += 1
except KeyError:
    # word has no pretrained word2vec vector; keep the zero embedding
    pass
# Number of vocabulary entries that received a non-zero (pretrained) embedding
print(len(np.unique(np.nonzero(embedding_matrix)[0])))
print(count)
37785 37785
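A rough coverage check before the vocabulary objects are deleted in the next cell: how many of the kept words actually received a pretrained vector, and which frequent words fell back to the all-zero (UNK-like) vector. A sketch, assuming embedding_matrix, word_index and MAX_WORDS from above:
kept = min(len(word_index), MAX_WORDS)
covered = len(np.unique(np.nonzero(embedding_matrix)[0]))
print(f'Pretrained vectors found for {covered}/{kept} kept words ({100*covered/kept:.1f}%)')
# A few frequent tokens without a pretrained word2vec vector (they keep the all-zero embedding)
missing = [w for w, i in word_index.items() if i <= MAX_WORDS and not embedding_matrix[i].any()]
print(missing[:10])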
# Free RAM
del word2vec
del word_index
del tokenizer
# Note: old checkpoints from previous runs may need to be deleted here
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, GRU, Embedding
from tensorflow.keras.optimizers import Adam
BATCH_SIZE=256
EPOCHS=30
GRU_SIZE = 64
DENSE = 32
MAX_WORDS = 100000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 250
# create an empty sequential model
model = Sequential()
# Add an embedding layer
# Randomly initialized custom embedding layer, trained end-to-end with the model
# model.add(Embedding(input_dim=MAX_WORDS+2, output_dim=EMBEDDING_DIM,
# input_length=MAX_SEQUENCE_LENGTH,mask_zero=True, trainable=True))
# Word2vec initialized embedding layer, not trainable
model.add(Embedding(input_dim=MAX_WORDS+2, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH,mask_zero=True, trainable=False))
# Could we fine-tune the Word2Vec embeddings further? What would we need for this to work well?
# Hint: a much smaller learning rate (see the sketch after the training output below)
# Add a bidirectional GRU layer with 0.33 variational (recurrent) dropout
model.add(Bidirectional(GRU(GRU_SIZE, input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM), return_sequences=False, recurrent_dropout = 0.33)))
# return_sequences=False: return only the last output of the sequence instead of the full sequence of outputs
# Add a hidden MLP layer
model.add(Dropout(0.33))
model.add(Dense(DENSE, activation='relu' ))
# Add the output MLP layer
model.add(Dropout(0.33))
model.add(Dense(len(twenty_train.target_names), activation='softmax'))
# Multi-class classification -> Use softmax over all possible classes
# model.build((None, EMBEDDING_DIM, VECTOR_DIMENSION))
print(model.summary())
model.compile(loss='categorical_crossentropy',
optimizer=Adam(learning_rate=0.001),
metrics=["accuracy"])
# Save the best model weights (by validation accuracy) with ModelCheckpoint
# If saving to Google Drive instead, use e.g.:
# '/content/gdrive/My Drive/checkpoints'
if not os.path.exists('/content/checkpoints'):
os.makedirs('/content/checkpoints')
# '/content/gdrive/My Drive/checkpoints/BiGRUMLP.hdf5'
checkpoint = ModelCheckpoint('/content/checkpoints/BiGRUMLP.hdf5',
monitor='val_accuracy',
mode='max', verbose=2,
save_best_only=True,
save_weights_only=True)
history = model.fit(train_data,
y_train_1_hot,
validation_data=(val_data, y_val_1_hot),
batch_size=BATCH_SIZE,
epochs=EPOCHS,
shuffle=True,
callbacks=[Metrics(valid_data=(val_data, y_val_1_hot)),
checkpoint])
WARNING:tensorflow:Layer gru will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer gru will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer gru will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 250, 300) 30000600 bidirectional (Bidirection (None, 128) 140544 al) dropout (Dropout) (None, 128) 0 dense (Dense) (None, 32) 4128 dropout_1 (Dropout) (None, 32) 0 dense_1 (Dense) (None, 20) 660 ================================================================= Total params: 30145932 (115.00 MB) Trainable params: 145332 (567.70 KB) Non-trainable params: 30000600 (114.44 MB) _________________________________________________________________ None Epoch 1/30 107/107 [==============================] - 19s 174ms/step — val_f1: 0.042923 — val_precision: 0.105574 — val_recall: 0.076878 Epoch 1: val_accuracy improved from -inf to 0.07688, saving model to /content/checkpoints/BiGRUMLP.hdf5
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
31/31 [==============================] - 97s 3s/step - loss: 2.9931 - accuracy: 0.0596 - val_loss: 2.9764 - val_accuracy: 0.0769 - val_f1: 0.0429 - val_recall: 0.0769 - val_precision: 0.1056 Epoch 2/30 107/107 [==============================] - 16s 151ms/step — val_f1: 0.058762 — val_precision: 0.138320 — val_recall: 0.097791 Epoch 2: val_accuracy improved from 0.07688 to 0.09779, saving model to /content/checkpoints/BiGRUMLP.hdf5
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
31/31 [==============================] - 86s 3s/step - loss: 2.9590 - accuracy: 0.0876 - val_loss: 2.9334 - val_accuracy: 0.0978 - val_f1: 0.0588 - val_recall: 0.0978 - val_precision: 0.1383 Epoch 3/30 107/107 [==============================] - 17s 155ms/step — val_f1: 0.102934 — val_precision: 0.263729 — val_recall: 0.159647 Epoch 3: val_accuracy improved from 0.09779 to 0.15965, saving model to /content/checkpoints/BiGRUMLP.hdf5
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
31/31 [==============================] - 83s 3s/step - loss: 2.8666 - accuracy: 0.1255 - val_loss: 2.6401 - val_accuracy: 0.1596 - val_f1: 0.1029 - val_recall: 0.1596 - val_precision: 0.2637 Epoch 4/30 107/107 [==============================] - 16s 154ms/step — val_f1: 0.200691 — val_precision: 0.316945 — val_recall: 0.227982 Epoch 4: val_accuracy improved from 0.15965 to 0.22798, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 83s 3s/step - loss: 2.5416 - accuracy: 0.1666 - val_loss: 2.3098 - val_accuracy: 0.2280 - val_f1: 0.2007 - val_recall: 0.2280 - val_precision: 0.3169 Epoch 5/30 107/107 [==============================] - 17s 161ms/step — val_f1: 0.271968 — val_precision: 0.351335 — val_recall: 0.293962 Epoch 5: val_accuracy improved from 0.22798 to 0.29396, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 87s 3s/step - loss: 2.3186 - accuracy: 0.2009 - val_loss: 2.0440 - val_accuracy: 0.2940 - val_f1: 0.2720 - val_recall: 0.2940 - val_precision: 0.3513 Epoch 6/30 107/107 [==============================] - 17s 155ms/step — val_f1: 0.302932 — val_precision: 0.373340 — val_recall: 0.344330 Epoch 6: val_accuracy improved from 0.29396 to 0.34433, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 84s 3s/step - loss: 2.1012 - accuracy: 0.2470 - val_loss: 1.8731 - val_accuracy: 0.3443 - val_f1: 0.3029 - val_recall: 0.3443 - val_precision: 0.3733 Epoch 7/30 107/107 [==============================] - 16s 154ms/step — val_f1: 0.355699 — val_precision: 0.387914 — val_recall: 0.377320 Epoch 7: val_accuracy improved from 0.34433 to 0.37732, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 83s 3s/step - loss: 1.9350 - accuracy: 0.3004 - val_loss: 1.7349 - val_accuracy: 0.3773 - val_f1: 0.3557 - val_recall: 0.3773 - val_precision: 0.3879 Epoch 8/30 107/107 [==============================] - 23s 215ms/step — val_f1: 0.397076 — val_precision: 0.428706 — val_recall: 0.425331 Epoch 8: val_accuracy improved from 0.37732 to 0.42533, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 103s 3s/step - loss: 1.8202 - accuracy: 0.3297 - val_loss: 1.5829 - val_accuracy: 0.4253 - val_f1: 0.3971 - val_recall: 0.4253 - val_precision: 0.4287 Epoch 9/30 107/107 [==============================] - 31s 287ms/step — val_f1: 0.434459 — val_precision: 0.457162 — val_recall: 0.455376 Epoch 9: val_accuracy improved from 0.42533 to 0.45538, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 119s 4s/step - loss: 1.7234 - accuracy: 0.3682 - val_loss: 1.5122 - val_accuracy: 0.4554 - val_f1: 0.4345 - val_recall: 0.4554 - val_precision: 0.4572 Epoch 10/30 107/107 [==============================] - 23s 219ms/step — val_f1: 0.459257 — val_precision: 0.470721 — val_recall: 0.475700 Epoch 10: val_accuracy improved from 0.45538 to 0.47570, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 122s 4s/step - loss: 1.6063 - accuracy: 0.4030 - val_loss: 1.4651 - val_accuracy: 0.4757 - val_f1: 0.4593 - val_recall: 0.4757 - val_precision: 0.4707 Epoch 11/30 107/107 [==============================] - 18s 164ms/step — val_f1: 0.482110 — val_precision: 0.506205 — val_recall: 0.500736 Epoch 11: val_accuracy improved from 0.47570 to 0.50074, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 102s 3s/step - loss: 
1.5334 - accuracy: 0.4247 - val_loss: 1.3827 - val_accuracy: 0.5007 - val_f1: 0.4821 - val_recall: 0.5007 - val_precision: 0.5062 Epoch 12/30 107/107 [==============================] - 17s 158ms/step — val_f1: 0.504738 — val_precision: 0.531669 — val_recall: 0.536082 Epoch 12: val_accuracy improved from 0.50074 to 0.53608, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 103s 3s/step - loss: 1.4398 - accuracy: 0.4653 - val_loss: 1.3111 - val_accuracy: 0.5361 - val_f1: 0.5047 - val_recall: 0.5361 - val_precision: 0.5317 Epoch 13/30 107/107 [==============================] - 18s 169ms/step — val_f1: 0.560080 — val_precision: 0.592481 — val_recall: 0.572018 Epoch 13: val_accuracy improved from 0.53608 to 0.57202, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 85s 3s/step - loss: 1.3459 - accuracy: 0.5083 - val_loss: 1.2334 - val_accuracy: 0.5720 - val_f1: 0.5601 - val_recall: 0.5720 - val_precision: 0.5925 Epoch 14/30 107/107 [==============================] - 17s 160ms/step — val_f1: 0.573047 — val_precision: 0.593837 — val_recall: 0.584094 Epoch 14: val_accuracy improved from 0.57202 to 0.58409, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 80s 3s/step - loss: 1.2944 - accuracy: 0.5225 - val_loss: 1.1822 - val_accuracy: 0.5841 - val_f1: 0.5730 - val_recall: 0.5841 - val_precision: 0.5938 Epoch 15/30 107/107 [==============================] - 16s 151ms/step — val_f1: 0.593962 — val_precision: 0.605614 — val_recall: 0.609720 Epoch 15: val_accuracy improved from 0.58409 to 0.60972, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 83s 3s/step - loss: 1.2115 - accuracy: 0.5558 - val_loss: 1.1312 - val_accuracy: 0.6097 - val_f1: 0.5940 - val_recall: 0.6097 - val_precision: 0.6056 Epoch 16/30 107/107 [==============================] - 18s 166ms/step — val_f1: 0.587618 — val_precision: 0.598173 — val_recall: 0.609426 Epoch 16: val_accuracy did not improve from 0.60972 31/31 [==============================] - 81s 3s/step - loss: 1.1470 - accuracy: 0.5874 - val_loss: 1.1292 - val_accuracy: 0.6094 - val_f1: 0.5876 - val_recall: 0.6094 - val_precision: 0.5982 Epoch 17/30 107/107 [==============================] - 17s 159ms/step — val_f1: 0.606549 — val_precision: 0.616264 — val_recall: 0.626510 Epoch 17: val_accuracy improved from 0.60972 to 0.62651, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 97s 3s/step - loss: 1.1487 - accuracy: 0.5826 - val_loss: 1.0850 - val_accuracy: 0.6265 - val_f1: 0.6065 - val_recall: 0.6265 - val_precision: 0.6163 Epoch 18/30 107/107 [==============================] - 18s 171ms/step — val_f1: 0.617299 — val_precision: 0.628579 — val_recall: 0.631517 Epoch 18: val_accuracy improved from 0.62651 to 0.63152, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 100s 3s/step - loss: 1.1071 - accuracy: 0.6027 - val_loss: 1.0631 - val_accuracy: 0.6315 - val_f1: 0.6173 - val_recall: 0.6315 - val_precision: 0.6286 Epoch 19/30 107/107 [==============================] - 21s 197ms/step — val_f1: 0.631887 — val_precision: 0.663628 — val_recall: 0.645655 Epoch 19: val_accuracy improved from 0.63152 to 0.64566, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 112s 4s/step - loss: 1.0393 - accuracy: 0.6208 - val_loss: 1.0454 - val_accuracy: 0.6457 - val_f1: 0.6319 - val_recall: 0.6457 - 
val_precision: 0.6636 Epoch 20/30 107/107 [==============================] - 21s 201ms/step — val_f1: 0.643488 — val_precision: 0.651097 — val_recall: 0.656554 Epoch 20: val_accuracy improved from 0.64566 to 0.65655, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 101s 3s/step - loss: 1.0380 - accuracy: 0.6314 - val_loss: 1.0332 - val_accuracy: 0.6566 - val_f1: 0.6435 - val_recall: 0.6566 - val_precision: 0.6511 Epoch 21/30 107/107 [==============================] - 17s 157ms/step — val_f1: 0.670318 — val_precision: 0.675014 — val_recall: 0.677761 Epoch 21: val_accuracy improved from 0.65655 to 0.67776, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 98s 3s/step - loss: 0.9716 - accuracy: 0.6468 - val_loss: 0.9956 - val_accuracy: 0.6778 - val_f1: 0.6703 - val_recall: 0.6778 - val_precision: 0.6750 Epoch 22/30 107/107 [==============================] - 17s 156ms/step — val_f1: 0.679968 — val_precision: 0.677389 — val_recall: 0.692489 Epoch 22: val_accuracy improved from 0.67776 to 0.69249, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 85s 3s/step - loss: 0.9370 - accuracy: 0.6646 - val_loss: 0.9413 - val_accuracy: 0.6925 - val_f1: 0.6800 - val_recall: 0.6925 - val_precision: 0.6774 Epoch 23/30 107/107 [==============================] - 18s 171ms/step — val_f1: 0.689051 — val_precision: 0.698551 — val_recall: 0.697496 Epoch 23: val_accuracy improved from 0.69249 to 0.69750, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 94s 3s/step - loss: 0.8957 - accuracy: 0.6800 - val_loss: 0.9263 - val_accuracy: 0.6975 - val_f1: 0.6891 - val_recall: 0.6975 - val_precision: 0.6986 Epoch 24/30 107/107 [==============================] - 17s 155ms/step — val_f1: 0.687034 — val_precision: 0.695312 — val_recall: 0.693667 Epoch 24: val_accuracy did not improve from 0.69750 31/31 [==============================] - 88s 3s/step - loss: 0.8759 - accuracy: 0.6888 - val_loss: 0.9442 - val_accuracy: 0.6937 - val_f1: 0.6870 - val_recall: 0.6937 - val_precision: 0.6953 Epoch 25/30 107/107 [==============================] - 17s 160ms/step — val_f1: 0.701646 — val_precision: 0.705686 — val_recall: 0.706038 Epoch 25: val_accuracy improved from 0.69750 to 0.70604, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 82s 3s/step - loss: 0.8670 - accuracy: 0.6944 - val_loss: 0.9070 - val_accuracy: 0.7060 - val_f1: 0.7016 - val_recall: 0.7060 - val_precision: 0.7057 Epoch 26/30 107/107 [==============================] - 17s 154ms/step — val_f1: 0.696898 — val_precision: 0.703126 — val_recall: 0.708100 Epoch 26: val_accuracy improved from 0.70604 to 0.70810, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 85s 3s/step - loss: 0.8353 - accuracy: 0.7034 - val_loss: 0.9126 - val_accuracy: 0.7081 - val_f1: 0.6969 - val_recall: 0.7081 - val_precision: 0.7031 Epoch 27/30 107/107 [==============================] - 16s 154ms/step — val_f1: 0.710743 — val_precision: 0.719713 — val_recall: 0.718704 Epoch 27: val_accuracy improved from 0.70810 to 0.71870, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 83s 3s/step - loss: 0.7977 - accuracy: 0.7219 - val_loss: 0.8898 - val_accuracy: 0.7187 - val_f1: 0.7107 - val_recall: 0.7187 - val_precision: 0.7197 Epoch 28/30 107/107 [==============================] - 16s 151ms/step — val_f1: 0.712709 — 
val_precision: 0.718023 — val_recall: 0.723417 Epoch 28: val_accuracy improved from 0.71870 to 0.72342, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 81s 3s/step - loss: 0.7834 - accuracy: 0.7247 - val_loss: 0.9059 - val_accuracy: 0.7234 - val_f1: 0.7127 - val_recall: 0.7234 - val_precision: 0.7180 Epoch 29/30 107/107 [==============================] - 16s 150ms/step — val_f1: 0.707936 — val_precision: 0.723141 — val_recall: 0.716053 Epoch 29: val_accuracy did not improve from 0.72342 31/31 [==============================] - 78s 3s/step - loss: 0.7389 - accuracy: 0.7399 - val_loss: 0.9172 - val_accuracy: 0.7163 - val_f1: 0.7079 - val_recall: 0.7161 - val_precision: 0.7231 Epoch 30/30 107/107 [==============================] - 16s 148ms/step — val_f1: 0.715801 — val_precision: 0.721311 — val_recall: 0.724300 Epoch 30: val_accuracy improved from 0.72342 to 0.72430, saving model to /content/checkpoints/BiGRUMLP.hdf5 31/31 [==============================] - 80s 3s/step - loss: 0.7377 - accuracy: 0.7424 - val_loss: 0.8948 - val_accuracy: 0.7243 - val_f1: 0.7158 - val_recall: 0.7243 - val_precision: 0.7213
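As hinted in the commented question above, the pretrained Word2Vec embeddings could also be fine-tuned instead of kept frozen. The key is a much smaller learning rate, so the pretrained vectors are not destroyed while the randomly initialized layers are still settling. A minimal sketch (not run here), reusing the layers and constants defined above:
# Sketch: same architecture, but with a trainable (fine-tuned) Word2Vec embedding layer
ft_model = Sequential()
ft_model.add(Embedding(input_dim=MAX_WORDS+2, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                       input_length=MAX_SEQUENCE_LENGTH, mask_zero=True, trainable=True))  # now trainable
ft_model.add(Bidirectional(GRU(GRU_SIZE, return_sequences=False, recurrent_dropout=0.33)))
ft_model.add(Dropout(0.33))
ft_model.add(Dense(DENSE, activation='relu'))
ft_model.add(Dropout(0.33))
ft_model.add(Dense(len(twenty_train.target_names), activation='softmax'))
ft_model.compile(loss='categorical_crossentropy',
                 optimizer=Adam(learning_rate=1e-4),  # roughly 10x smaller than before
                 metrics=['accuracy'])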
%matplotlib inline
import matplotlib.pyplot as plt
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper right')
plt.show()
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report
GRU_SIZE = 64
DENSE = 32
with tf.device('/device:GPU:0'):
model = Sequential()
model.add(Embedding(input_dim=MAX_WORDS+2, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH,mask_zero=True, trainable=False))
model.add(Bidirectional(GRU(GRU_SIZE, return_sequences=False, recurrent_dropout = 0.33)))
model.add(Dense(DENSE, activation='relu' ))
model.add(Dense(len(twenty_train.target_names), activation='softmax'))
# Load weights from the pre-trained model
model.load_weights("/content/checkpoints/BiGRUMLP.hdf5")
# model.load_weights("/content/gdrive/My Drive/checkpoints/BiGRUMLP.hdf5")
predictions = np.argmax(model.predict(val_data), -1)
print(classification_report(y_val, predictions, target_names=twenty_train.target_names))
WARNING:tensorflow:Layer gru_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer gru_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer gru_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
107/107 [==============================] - 17s 152ms/step precision recall f1-score support alt.atheism 0.68 0.68 0.68 160 comp.graphics 0.39 0.28 0.33 165 comp.os.ms-windows.misc 0.73 0.74 0.74 189 comp.sys.ibm.pc.hardware 0.48 0.40 0.44 168 comp.sys.mac.hardware 0.59 0.56 0.57 182 comp.windows.x 0.56 0.79 0.66 168 misc.forsale 0.65 0.66 0.66 182 rec.autos 0.78 0.74 0.76 181 rec.motorcycles 0.93 0.74 0.83 184 rec.sport.baseball 0.93 0.84 0.88 169 rec.sport.hockey 0.89 0.91 0.90 175 sci.crypt 0.85 0.93 0.89 177 sci.electronics 0.61 0.71 0.66 173 sci.med 0.86 0.90 0.88 181 sci.space 0.94 0.88 0.91 181 soc.religion.christian 0.63 0.90 0.74 177 talk.politics.guns 0.72 0.85 0.78 177 talk.politics.mideast 0.83 0.94 0.88 170 talk.politics.misc 0.78 0.56 0.66 135 talk.religion.misc 0.44 0.16 0.23 101 accuracy 0.72 3395 macro avg 0.71 0.71 0.70 3395 weighted avg 0.72 0.72 0.72 3395
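The weaker classes in the report above (e.g. comp.graphics, talk.religion.misc) are easier to diagnose with a confusion matrix; a minimal sketch, assuming y_val and predictions from the cell above:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Rows are true classes, columns are predicted classes
cm = confusion_matrix(y_val, predictions)
disp = ConfusionMatrixDisplay(cm, display_labels=twenty_train.target_names)
disp.plot(xticks_rotation='vertical', include_values=False)
plt.show()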
from sklearn.metrics import accuracy_score
predictions = np.argmax(model.predict(val_data), -1)
print(f'Validation Accuracy: {accuracy_score(y_val, predictions)*100:.2f}%')
predictions = np.argmax(model.predict(test_data), -1)
print(f'Test Accuracy:{accuracy_score(y_test, predictions)*100:.2f}%')
107/107 [==============================] - 18s 163ms/step Validation Accuracy: 72.43% 32/32 [==============================] - 5s 145ms/step Test Accuracy:67.10%
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.layers import Layer
import numpy as np
def dot_product(x, kernel):
"""
Wrapper for dot product operation, in order to be compatible with both
Theano and Tensorflow
Args:
x (): input
kernel (): weights
Returns:
"""
if K.backend() == 'tensorflow':
return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
else:
return K.dot(x, kernel)
class LinearAttention(Layer):
def __init__(self,
kernel_regularizer=None, bias_regularizer=None,
W_constraint=None, b_constraint=None,
bias=True,
return_attention=False,
**kwargs):
self.supports_masking = True
self.init = initializers.get('glorot_uniform')
#apply penalties on layer parameters or layer activity during optimization.
#These penalties are summed into the loss function that the network optimizes.
self.W_regularizer = regularizers.get(kernel_regularizer)
self.b_regularizer = regularizers.get(bias_regularizer)
#setting constraints (eg. non-negativity) on model parameters during training.
self.W_constraint = constraints.get(W_constraint)
self.b_constraint = constraints.get(b_constraint)
self.bias = bias
self.return_attention = return_attention
super(LinearAttention, self).__init__(**kwargs)
def build(self, input_shape):
assert len(input_shape) == 3
#input_shape[-1] = 600
self.W = self.add_weight(shape=(input_shape[-1],),
initializer=self.init,
name='{}_W'.format(self.name),
regularizer=self.W_regularizer,
constraint=self.W_constraint)
if self.bias:
self.b = self.add_weight(shape=(1,),
initializer='zero',
name='{}_b'.format(self.name),
regularizer=self.b_regularizer,
constraint=self.b_constraint)
else:
self.b = None
self.built = True
def compute_mask(self, inputs, mask=None):
# do not pass the mask to the next layers
if self.return_attention:
return [None, None]
return None
def call(self, x, mask=None):
# eij = Wx + b
eij = dot_product(x, self.W)
if self.bias:
eij += self.b
# Apply mask
if mask is not None:
eij *= K.cast(mask, K.floatx())
# a = softmax(eij)
# a has shape (batch, time steps, 1) after expand_dims; time steps = MAX_SEQUENCE_LENGTH (250 here)
a = K.expand_dims(K.softmax(eij, axis=-1))
# position-wise multiplication -> shape (batch, time steps, features)
weighted_input = x * a
# shape -> 600
result = K.sum(weighted_input, axis=1)
if self.return_attention:
return [result, a]
return result
def compute_output_shape(self, input_shape):
if self.return_attention:
# input_shape[-1] -> feature size (e.g. 600 for the BiLSTM used below)
# input_shape[1] -> time steps (MAX_SEQUENCE_LENGTH)
# input_shape[0] -> batch size / number of samples
return [(input_shape[0], input_shape[-1]),
(input_shape[0], input_shape[1])]
else:
return input_shape[0], input_shape[-1]
class DeepAttention(Layer):
def __init__(self,
kernel_regularizer=None, u_regularizer=None, bias_regularizer=None,
W_constraint=None, u_constraint=None, b_constraint=None,
bias=True,
return_attention=False,
**kwargs):
self.supports_masking = True
self.init = initializers.get('glorot_uniform')
self.W_regularizer = regularizers.get(kernel_regularizer)
self.u_regularizer = regularizers.get(u_regularizer)
self.b1_regularizer = regularizers.get(bias_regularizer)
self.b2_regularizer = regularizers.get(bias_regularizer)
self.W_constraint = constraints.get(W_constraint)
self.u_constraint = constraints.get(u_constraint)
self.b1_constraint = constraints.get(b_constraint)
self.b2_constraint = constraints.get(b_constraint)
self.bias = bias
self.return_attention = return_attention
super(DeepAttention, self).__init__(**kwargs)
def build(self, input_shape):
assert len(input_shape) == 3
self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
initializer=self.init,
name='{}_W'.format(self.name),
regularizer=self.W_regularizer,
constraint=self.W_constraint)
if self.bias:
self.b1 = self.add_weight(shape=(input_shape[-1],),
initializer='zero',
name='{}_b1'.format(self.name),
regularizer=self.b1_regularizer,
constraint=self.b1_constraint)
self.b2 = self.add_weight(shape=(1,),
initializer='zero',
name='{}_b2'.format(self.name),
regularizer=self.b2_regularizer,
constraint=self.b2_constraint)
else:
self.b1 = None
self.b2 = None
self.u = self.add_weight(shape=(input_shape[-1],),
initializer=self.init,
name='{}_u'.format(self.name),
regularizer=self.u_regularizer,
constraint=self.u_constraint)
self.built = True
def compute_mask(self, inputs, mask=None):
# do not pass the mask to the next layers
if self.return_attention:
return [None, None]
return None
def call(self, x, mask=None):
# uit = tanh(Wx + b)
uit = dot_product(x, self.W)
if self.bias:
uit += self.b1
uit = K.tanh(uit)
# ait = softmax(Ueij)
eij = dot_product(uit, self.u)
if self.bias:
eij += self.b2
# Apply mask
if mask is not None:
eij *= K.cast(mask, K.floatx())
a = K.expand_dims(K.softmax(eij, axis=-1))
weighted_input = x * a
result = K.sum(weighted_input, axis=1)
if self.return_attention:
return [result, a]
return result
def compute_output_shape(self, input_shape):
if self.return_attention:
return [(input_shape[0], input_shape[-1]),
(input_shape[0], input_shape[1])]
else:
return input_shape[0], input_shape[-1]
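In equation form, LinearAttention scores each hidden state $h_t$ directly, while DeepAttention (defined above) first passes it through a one-layer MLP and compares the result with a learned context vector $u$; masked (padded) positions simply have their scores multiplied by zero before the softmax:
$$e_t = w^\top h_t + b \;\;\text{(LinearAttention)}, \qquad u_t = \tanh(W h_t + b_1),\; e_t = u^\top u_t + b_2 \;\;\text{(DeepAttention)}$$
$$a_t = \frac{\exp(e_t)}{\sum_{t'} \exp(e_{t'})}, \qquad \text{output} = \sum_t a_t h_t$$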
Architecture |
---|
RNN (BiLSTM) layers |
Attention Mechanism |
MLP layer |
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, LSTM, Embedding, Input
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
from tensorflow.keras import Model
# !pip install keras-self-attention
# from keras_self_attention import SeqSelfAttention
LSTM_SIZE = 300
DENSE = 1000
with tf.device('/device:GPU:0'):
inputs = Input((MAX_SEQUENCE_LENGTH,))
# Define the embedding layer with the pretrained word2vec weights
embeddings = Embedding(input_dim=MAX_WORDS+2, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH, mask_zero=True, trainable=False)(inputs)
drop_emb = Dropout(0.33)(embeddings)
# Define a (bidirectional) RNN with LSTM cells
bilstm = Bidirectional(LSTM(units=LSTM_SIZE, return_sequences=True, recurrent_dropout=0.33))(drop_emb)
drop_encodings = Dropout(0.33)(bilstm)
# Pass the encoding through an Attention Layer
x, attn = DeepAttention(return_attention=True)(drop_encodings)
# x, attn = LinearAttention(return_attention=True)(drop_encodings)
# Alternatively use keras package for self-attention
#x, attn = SeqSelfAttention(return_attention=True)(drop_encodings)
# Apply dropout to the encoding produced by the attention mechanism
drop_x = Dropout(0.33)(x)
# Pass through a Dense Layer
hidden = Dense(units=DENSE, activation="relu")(drop_x)
# Apply Dropout to the output of the Dense Layer
drop_out = Dropout(0.33)(hidden)
# Last pass through a Dense Layer with softmax activation to produce a probability distribution
out = Dense(units=len(twenty_train.target_names), activation="softmax")(drop_out)
# Wrap model --> Remember Functional API
model2 = Model(inputs=inputs, outputs=out)
print(model2.summary())
model2.compile(loss='categorical_crossentropy',
optimizer=Adam(learning_rate=0.001),
metrics=["accuracy"])
if not os.path.exists('/content/checkpoints'):
os.makedirs('/content/checkpoints')
checkpoint = ModelCheckpoint('/content/checkpoints/BiLSTM_attn.hdf5',
monitor='val_accuracy',
mode='max', verbose=2,
save_best_only=True,
save_weights_only=True)
history2 = model2.fit(train_data, y_train_1_hot,
validation_data=(val_data, y_val_1_hot),
batch_size=128,
epochs=30,
shuffle=True,
callbacks=[Metrics(valid_data=(val_data, y_val_1_hot)),
checkpoint])
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. /usr/local/lib/python3.10/dist-packages/keras/src/initializers/initializers.py:120: UserWarning: The initializer GlorotUniform is unseeded and being called multiple times, which will return identical values each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initializer instance more than once. warnings.warn(
Model: "model" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_1 (InputLayer) [(None, 250)] 0 embedding (Embedding) (None, 250, 300) 30000600 dropout (Dropout) (None, 250, 300) 0 bidirectional (Bidirection (None, 250, 600) 1442400 al) dropout_1 (Dropout) (None, 250, 600) 0 deep_attention (DeepAttent [(None, 600), 361201 ion) (None, 250, 1)] dropout_2 (Dropout) (None, 600) 0 dense (Dense) (None, 1000) 601000 dropout_3 (Dropout) (None, 1000) 0 dense_1 (Dense) (None, 20) 20020 ================================================================= Total params: 32425221 (123.69 MB) Trainable params: 2424621 (9.25 MB) Non-trainable params: 30000600 (114.44 MB) _________________________________________________________________
None Epoch 1/30 107/107 [==============================] - 22s 198ms/step — val_f1: 0.521783 — val_precision: 0.606270 — val_recall: 0.548748 Epoch 1: val_accuracy improved from -inf to 0.54875, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 173s 3s/step - loss: 1.9925 - accuracy: 0.3237 - val_loss: 1.3148 - val_accuracy: 0.5487 - val_f1: 0.5218 - val_recall: 0.5487 - val_precision: 0.6063 Epoch 2/30 107/107 [==============================] - 20s 191ms/step — val_f1: 0.700132 — val_precision: 0.710411 — val_recall: 0.707511 Epoch 2: val_accuracy improved from 0.54875 to 0.70751, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 175s 3s/step - loss: 1.1947 - accuracy: 0.6001 - val_loss: 0.9568 - val_accuracy: 0.7075 - val_f1: 0.7001 - val_recall: 0.7075 - val_precision: 0.7104 Epoch 3/30 107/107 [==============================] - 20s 187ms/step — val_f1: 0.718428 — val_precision: 0.740916 — val_recall: 0.725479 Epoch 3: val_accuracy improved from 0.70751 to 0.72548, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 158s 3s/step - loss: 0.9418 - accuracy: 0.6962 - val_loss: 0.8615 - val_accuracy: 0.7255 - val_f1: 0.7184 - val_recall: 0.7255 - val_precision: 0.7409 Epoch 4/30 107/107 [==============================] - 21s 199ms/step — val_f1: 0.748557 — val_precision: 0.769423 — val_recall: 0.751105 Epoch 4: val_accuracy improved from 0.72548 to 0.75110, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 178s 3s/step - loss: 0.8189 - accuracy: 0.7301 - val_loss: 0.7762 - val_accuracy: 0.7511 - val_f1: 0.7486 - val_recall: 0.7511 - val_precision: 0.7694 Epoch 5/30 107/107 [==============================] - 21s 193ms/step — val_f1: 0.770434 — val_precision: 0.783276 — val_recall: 0.769956 Epoch 5: val_accuracy improved from 0.75110 to 0.76996, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 160s 3s/step - loss: 0.7321 - accuracy: 0.7586 - val_loss: 0.7425 - val_accuracy: 0.7700 - val_f1: 0.7704 - val_recall: 0.7700 - val_precision: 0.7833 Epoch 6/30 107/107 [==============================] - 21s 199ms/step — val_f1: 0.779297 — val_precision: 0.792297 — val_recall: 0.778498 Epoch 6: val_accuracy improved from 0.76996 to 0.77850, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 179s 3s/step - loss: 0.6830 - accuracy: 0.7693 - val_loss: 0.7044 - val_accuracy: 0.7785 - val_f1: 0.7793 - val_recall: 0.7785 - val_precision: 0.7923 Epoch 7/30 107/107 [==============================] - 21s 195ms/step — val_f1: 0.785049 — val_precision: 0.808186 — val_recall: 0.786156 Epoch 7: val_accuracy improved from 0.77850 to 0.78616, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 178s 3s/step - loss: 0.6185 - accuracy: 0.7933 - val_loss: 0.6838 - val_accuracy: 0.7862 - val_f1: 0.7850 - val_recall: 0.7862 - val_precision: 0.8082 Epoch 8/30 107/107 [==============================] - 22s 202ms/step — val_f1: 0.811010 — val_precision: 0.817126 — val_recall: 0.810604 Epoch 8: val_accuracy improved from 0.78616 to 0.81060, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 159s 3s/step - loss: 0.5771 - accuracy: 0.8092 - val_loss: 0.6231 - val_accuracy: 0.8106 - val_f1: 0.8110 - val_recall: 0.8106 - val_precision: 0.8171 Epoch 9/30 107/107 
[==============================] - 20s 189ms/step — val_f1: 0.796695 — val_precision: 0.804522 — val_recall: 0.801178 Epoch 9: val_accuracy did not improve from 0.81060 62/62 [==============================] - 158s 3s/step - loss: 0.5386 - accuracy: 0.8260 - val_loss: 0.6250 - val_accuracy: 0.8012 - val_f1: 0.7967 - val_recall: 0.8012 - val_precision: 0.8045 Epoch 10/30 107/107 [==============================] - 21s 197ms/step — val_f1: 0.821066 — val_precision: 0.827142 — val_recall: 0.819440 Epoch 10: val_accuracy improved from 0.81060 to 0.81944, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 157s 3s/step - loss: 0.4968 - accuracy: 0.8341 - val_loss: 0.5832 - val_accuracy: 0.8194 - val_f1: 0.8211 - val_recall: 0.8194 - val_precision: 0.8271 Epoch 11/30 107/107 [==============================] - 21s 197ms/step — val_f1: 0.814973 — val_precision: 0.823588 — val_recall: 0.817673 Epoch 11: val_accuracy did not improve from 0.81944 62/62 [==============================] - 155s 3s/step - loss: 0.4499 - accuracy: 0.8516 - val_loss: 0.6115 - val_accuracy: 0.8177 - val_f1: 0.8150 - val_recall: 0.8177 - val_precision: 0.8236 Epoch 12/30 107/107 [==============================] - 20s 187ms/step — val_f1: 0.827455 — val_precision: 0.832135 — val_recall: 0.829455 Epoch 12: val_accuracy improved from 0.81944 to 0.82946, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 159s 3s/step - loss: 0.4310 - accuracy: 0.8564 - val_loss: 0.5865 - val_accuracy: 0.8295 - val_f1: 0.8275 - val_recall: 0.8295 - val_precision: 0.8321 Epoch 13/30 107/107 [==============================] - 21s 195ms/step — val_f1: 0.827546 — val_precision: 0.831140 — val_recall: 0.829161 Epoch 13: val_accuracy did not improve from 0.82946 62/62 [==============================] - 156s 3s/step - loss: 0.4049 - accuracy: 0.8621 - val_loss: 0.5805 - val_accuracy: 0.8292 - val_f1: 0.8275 - val_recall: 0.8292 - val_precision: 0.8311 Epoch 14/30 107/107 [==============================] - 21s 195ms/step — val_f1: 0.835334 — val_precision: 0.840626 — val_recall: 0.835935 Epoch 14: val_accuracy improved from 0.82946 to 0.83594, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 154s 2s/step - loss: 0.3655 - accuracy: 0.8752 - val_loss: 0.5747 - val_accuracy: 0.8359 - val_f1: 0.8353 - val_recall: 0.8359 - val_precision: 0.8406 Epoch 15/30 107/107 [==============================] - 21s 192ms/step — val_f1: 0.830022 — val_precision: 0.836516 — val_recall: 0.830044 Epoch 15: val_accuracy did not improve from 0.83594 62/62 [==============================] - 181s 3s/step - loss: 0.3433 - accuracy: 0.8848 - val_loss: 0.5721 - val_accuracy: 0.8300 - val_f1: 0.8300 - val_recall: 0.8300 - val_precision: 0.8365 Epoch 16/30 107/107 [==============================] - 20s 186ms/step — val_f1: 0.832501 — val_precision: 0.838497 — val_recall: 0.832401 Epoch 16: val_accuracy did not improve from 0.83594 62/62 [==============================] - 155s 3s/step - loss: 0.3155 - accuracy: 0.8963 - val_loss: 0.5820 - val_accuracy: 0.8324 - val_f1: 0.8325 - val_recall: 0.8324 - val_precision: 0.8385 Epoch 17/30 107/107 [==============================] - 21s 192ms/step — val_f1: 0.835755 — val_precision: 0.841986 — val_recall: 0.837113 Epoch 17: val_accuracy improved from 0.83594 to 0.83711, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 177s 3s/step - loss: 0.2877 - accuracy: 0.9076 - 
val_loss: 0.5864 - val_accuracy: 0.8371 - val_f1: 0.8358 - val_recall: 0.8371 - val_precision: 0.8420 Epoch 18/30 107/107 [==============================] - 21s 197ms/step — val_f1: 0.837591 — val_precision: 0.842126 — val_recall: 0.838586 Epoch 18: val_accuracy improved from 0.83711 to 0.83859, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 155s 3s/step - loss: 0.2893 - accuracy: 0.9016 - val_loss: 0.5857 - val_accuracy: 0.8386 - val_f1: 0.8376 - val_recall: 0.8386 - val_precision: 0.8421 Epoch 19/30 107/107 [==============================] - 20s 184ms/step — val_f1: 0.850468 — val_precision: 0.853581 — val_recall: 0.849779 Epoch 19: val_accuracy improved from 0.83859 to 0.84978, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 157s 3s/step - loss: 0.2678 - accuracy: 0.9115 - val_loss: 0.5446 - val_accuracy: 0.8498 - val_f1: 0.8505 - val_recall: 0.8498 - val_precision: 0.8536 Epoch 20/30 107/107 [==============================] - 21s 195ms/step — val_f1: 0.845820 — val_precision: 0.850999 — val_recall: 0.845361 Epoch 20: val_accuracy did not improve from 0.84978 62/62 [==============================] - 156s 3s/step - loss: 0.2422 - accuracy: 0.9202 - val_loss: 0.5782 - val_accuracy: 0.8454 - val_f1: 0.8458 - val_recall: 0.8454 - val_precision: 0.8510 Epoch 21/30 107/107 [==============================] - 21s 197ms/step — val_f1: 0.842152 — val_precision: 0.845579 — val_recall: 0.842710 Epoch 21: val_accuracy did not improve from 0.84978 62/62 [==============================] - 176s 3s/step - loss: 0.2212 - accuracy: 0.9236 - val_loss: 0.6202 - val_accuracy: 0.8427 - val_f1: 0.8422 - val_recall: 0.8427 - val_precision: 0.8456 Epoch 22/30 107/107 [==============================] - 22s 202ms/step — val_f1: 0.846921 — val_precision: 0.853436 — val_recall: 0.847128 Epoch 22: val_accuracy did not improve from 0.84978 62/62 [==============================] - 175s 3s/step - loss: 0.2198 - accuracy: 0.9252 - val_loss: 0.6041 - val_accuracy: 0.8471 - val_f1: 0.8469 - val_recall: 0.8471 - val_precision: 0.8534 Epoch 23/30 107/107 [==============================] - 21s 196ms/step — val_f1: 0.842242 — val_precision: 0.847028 — val_recall: 0.843888 Epoch 23: val_accuracy did not improve from 0.84978 62/62 [==============================] - 174s 3s/step - loss: 0.2055 - accuracy: 0.9342 - val_loss: 0.5751 - val_accuracy: 0.8439 - val_f1: 0.8422 - val_recall: 0.8439 - val_precision: 0.8470 Epoch 24/30 107/107 [==============================] - 21s 195ms/step — val_f1: 0.849301 — val_precision: 0.854896 — val_recall: 0.849485 Epoch 24: val_accuracy did not improve from 0.84978 62/62 [==============================] - 163s 3s/step - loss: 0.1860 - accuracy: 0.9380 - val_loss: 0.5997 - val_accuracy: 0.8495 - val_f1: 0.8493 - val_recall: 0.8495 - val_precision: 0.8549 Epoch 25/30 107/107 [==============================] - 22s 206ms/step — val_f1: 0.849716 — val_precision: 0.855865 — val_recall: 0.848601 Epoch 25: val_accuracy did not improve from 0.84978 62/62 [==============================] - 185s 3s/step - loss: 0.1528 - accuracy: 0.9482 - val_loss: 0.6142 - val_accuracy: 0.8486 - val_f1: 0.8497 - val_recall: 0.8486 - val_precision: 0.8559 Epoch 26/30 107/107 [==============================] - 22s 203ms/step — val_f1: 0.851611 — val_precision: 0.854770 — val_recall: 0.850957 Epoch 26: val_accuracy improved from 0.84978 to 0.85096, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 
[==============================] - 181s 3s/step - loss: 0.1583 - accuracy: 0.9472 - val_loss: 0.6040 - val_accuracy: 0.8510 - val_f1: 0.8516 - val_recall: 0.8510 - val_precision: 0.8548 Epoch 27/30 107/107 [==============================] - 22s 203ms/step — val_f1: 0.855303 — val_precision: 0.858370 — val_recall: 0.855081 Epoch 27: val_accuracy improved from 0.85096 to 0.85508, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 180s 3s/step - loss: 0.1467 - accuracy: 0.9500 - val_loss: 0.6020 - val_accuracy: 0.8551 - val_f1: 0.8553 - val_recall: 0.8551 - val_precision: 0.8584 Epoch 28/30 107/107 [==============================] - 21s 198ms/step — val_f1: 0.844082 — val_precision: 0.852072 — val_recall: 0.844477 Epoch 28: val_accuracy did not improve from 0.85508 62/62 [==============================] - 179s 3s/step - loss: 0.1445 - accuracy: 0.9510 - val_loss: 0.6639 - val_accuracy: 0.8445 - val_f1: 0.8441 - val_recall: 0.8445 - val_precision: 0.8521 Epoch 29/30 107/107 [==============================] - 21s 200ms/step — val_f1: 0.849131 — val_precision: 0.852903 — val_recall: 0.848601 Epoch 29: val_accuracy did not improve from 0.85508 62/62 [==============================] - 178s 3s/step - loss: 0.1379 - accuracy: 0.9523 - val_loss: 0.6117 - val_accuracy: 0.8486 - val_f1: 0.8491 - val_recall: 0.8486 - val_precision: 0.8529 Epoch 30/30 107/107 [==============================] - 21s 199ms/step — val_f1: 0.857764 — val_precision: 0.861298 — val_recall: 0.857732 Epoch 30: val_accuracy improved from 0.85508 to 0.85773, saving model to /content/checkpoints/BiLSTM_attn.hdf5 62/62 [==============================] - 158s 3s/step - loss: 0.1228 - accuracy: 0.9601 - val_loss: 0.6177 - val_accuracy: 0.8577 - val_f1: 0.8578 - val_recall: 0.8577 - val_precision: 0.8613
%matplotlib inline
import matplotlib.pyplot as plt
# summarize history for accuracy
plt.plot(history2.history['accuracy'])
plt.plot(history2.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history2.history['loss'])
plt.plot(history2.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper right')
plt.show()
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, LSTM, Embedding, Input
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
from tensorflow.keras import Model
from sklearn.metrics import classification_report
LSTM_SIZE = 300
DENSE = 1000
with tf.device('/device:GPU:0'):
inputs = Input((MAX_SEQUENCE_LENGTH,))
embeddings = Embedding(MAX_WORDS+2,EMBEDDING_DIM, weights=[embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH, mask_zero=True, trainable=False)(inputs)
bilstm = Bidirectional(LSTM(units=LSTM_SIZE, return_sequences=True,recurrent_dropout = 0.33))(embeddings)
x, attn = DeepAttention(return_attention=True)(bilstm)
hidden = Dense(units=DENSE, activation="relu")(x)
out = Dense(units=len(twenty_train.target_names), activation="softmax")(hidden)  # softmax, matching the trained model
model2 = Model(inputs, out)
# Load weights from the pre-trained model
model2.load_weights("/content/checkpoints/BiLSTM_attn.hdf5")
print("Dev set performance")
predictions_val = np.argmax(model2.predict(val_data), -1)
print(classification_report(y_val, predictions_val, target_names=twenty_train.target_names))
print()
print("Training set performance") # Overfitting on training set
predictions_train = np.argmax(model2.predict(train_data), -1)
print(classification_report(y_train, predictions_train, target_names=twenty_train.target_names))
WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer lstm_1 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. /usr/local/lib/python3.10/dist-packages/keras/src/initializers/initializers.py:120: UserWarning: The initializer GlorotUniform is unseeded and being called multiple times, which will return identical values each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initializer instance more than once. warnings.warn(
Dev set performance 107/107 [==============================] - 22s 198ms/step precision recall f1-score support alt.atheism 0.92 0.76 0.83 160 comp.graphics 0.75 0.78 0.76 165 comp.os.ms-windows.misc 0.71 0.88 0.78 189 comp.sys.ibm.pc.hardware 0.67 0.64 0.66 168 comp.sys.mac.hardware 0.76 0.75 0.75 182 comp.windows.x 0.88 0.83 0.85 168 misc.forsale 0.81 0.79 0.80 182 rec.autos 0.94 0.76 0.84 181 rec.motorcycles 0.90 0.90 0.90 184 rec.sport.baseball 0.95 0.95 0.95 169 rec.sport.hockey 0.96 0.97 0.96 175 sci.crypt 0.92 0.95 0.93 177 sci.electronics 0.85 0.83 0.84 173 sci.med 0.98 0.96 0.97 181 sci.space 0.96 0.94 0.95 181 soc.religion.christian 0.80 0.93 0.86 177 talk.politics.guns 0.88 0.95 0.92 177 talk.politics.mideast 0.93 0.91 0.92 170 talk.politics.misc 0.86 0.85 0.86 135 talk.religion.misc 0.76 0.77 0.77 101 accuracy 0.86 3395 macro avg 0.86 0.85 0.86 3395 weighted avg 0.86 0.86 0.86 3395 Training set performance 248/248 [==============================] - 50s 201ms/step precision recall f1-score support alt.atheism 1.00 0.97 0.99 320 comp.graphics 1.00 0.99 0.99 419 comp.os.ms-windows.misc 0.94 1.00 0.97 402 comp.sys.ibm.pc.hardware 1.00 0.96 0.98 422 comp.sys.mac.hardware 1.00 0.99 1.00 396 comp.windows.x 0.99 1.00 1.00 425 misc.forsale 0.99 1.00 1.00 403 rec.autos 1.00 0.99 0.99 413 rec.motorcycles 1.00 1.00 1.00 414 rec.sport.baseball 1.00 1.00 1.00 428 rec.sport.hockey 1.00 1.00 1.00 425 sci.crypt 1.00 1.00 1.00 418 sci.electronics 1.00 0.99 0.99 418 sci.med 1.00 1.00 1.00 413 sci.space 1.00 1.00 1.00 412 soc.religion.christian 0.99 1.00 0.99 422 talk.politics.guns 1.00 1.00 1.00 369 talk.politics.mideast 0.99 1.00 1.00 394 talk.politics.misc 1.00 0.99 1.00 330 talk.religion.misc 0.99 0.99 0.99 276 accuracy 0.99 7919 macro avg 0.99 0.99 0.99 7919 weighted avg 0.99 0.99 0.99 7919
from sklearn.metrics import accuracy_score
predictions = np.argmax(model2.predict(val_data), -1)
print(f'Validation Accuracy: {accuracy_score(y_val, predictions)*100:.2f}%')
predictions = np.argmax(model2.predict(test_data), -1)
print(f'Test Accuracy:{accuracy_score(y_test, predictions)*100:.2f}%')
107/107 [==============================] - 21s 199ms/step Validation Accuracy: 85.77% 32/32 [==============================] - 6s 175ms/step Test Accuracy:79.00%
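Because the evaluation model above was built with return_attention=True, the attention distribution itself can be inspected to see which token positions drive a prediction; a minimal sketch, assuming inputs, attn and val_data from the cells above (the side model shares the trained layers with model2):
# Side model that outputs the attention weights over the MAX_SEQUENCE_LENGTH positions
attn_model = Model(inputs, attn)
weights = attn_model.predict(val_data[:1])[0].squeeze()  # shape: (MAX_SEQUENCE_LENGTH,)
top_positions = np.argsort(weights)[::-1][:10]
print('Most attended positions:', top_positions)
print('Attention weights there:', weights[top_positions])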
Model Name | Val Accuracy | Test Accuracy |
---|---|---|
Logistic Regression + TF-IDF | 83.74% | 76.83% |
MLP + TF-IDF | 86.95% | 77.10% |
MLP + Word2Vec Centroids | 79.73% | 70.61% |
############################ | ##### | ##### |
RNN custom embeddings | 78.76% | 67.70% |
RNN Word2Vec | 72.43% | 67.10% |
RNN custom embeddings + self-attention | 83.33% | 71.70% |
RNN Word2Vec + self-attention | 85.77% | 79.00% |