import gensim.downloader as api
word2vec = api.load('word2vec-google-news-300')
import tensorflow.keras.backend as K
def recall(y_true, y_pred):
    """
    Recall metric.

    Only computes a batch-wise average of recall.
    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def precision(y_true, y_pred):
    """
    Precision metric.

    Only computes a batch-wise average of precision.
    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.

    Source
    ------
    https://github.com/fchollet/keras/issues/5400#issuecomment-314747992
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def f1(y_true, y_pred):
    """Calculate the (batch-wise) F1 score."""
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    # epsilon avoids a division by zero when both precision and recall are 0
    return 2 * ((p * r) / (p + r + K.epsilon()))


def accuracy(y_true, y_pred):
    """Per-sample fraction of labels predicted correctly (predictions rounded at 0.5)."""
    return K.mean(K.equal(y_true, K.round(y_pred)), axis=1)
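As a quick sanity check (not part of the original notebook), the batch-wise metrics can be evaluated eagerly on toy tensors; the values below are made up for illustration.

import tensorflow as tf

# Toy multi-label batch: 2 samples, 3 labels
y_true = tf.constant([[1., 0., 1.], [0., 1., 0.]])
y_pred = tf.constant([[0.9, 0.2, 0.4], [0.1, 0.8, 0.7]])
print(float(precision(y_true, y_pred)))  # 2 true positives / 3 predicted positives ≈ 0.67
print(float(recall(y_true, y_pred)))     # 2 true positives / 3 actual positives ≈ 0.67
print(float(f1(y_true, y_pred)))         # ≈ 0.67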
The Reuters Corpus contains 10,788 news documents totaling 1.3 million words. The documents have been classified into 90 topics, and grouped into two sets, called "training" and "test".
Source: https://www.nltk.org/book/ch02.html
Categories in the Reuters corpus may overlap with each other, simply because a news story often covers multiple topics.
!python3 -m nltk.downloader reuters;
!unzip /root/nltk_data/corpora/reuters.zip -d /root/nltk_data/corpora;
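Having downloaded the corpus, a quick check (not part of the original notebook) makes the overlap concrete: a single story can carry several topic labels at once. The file id printed below is simply whichever comes first in the corpus.

from nltk.corpus import reuters

# A document may belong to several categories at once (multi-label data)
doc_id = reuters.fileids()[0]        # e.g. 'test/14826'
print(reuters.categories(doc_id))    # topic labels of this one story
print(len(reuters.categories()))     # 90 topics in total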
# Load the Reuters documents and their topic labels
import nltk
from nltk.corpus import reuters
from sklearn.preprocessing import MultiLabelBinarizer # turns per-document category lists into binary indicator vectors (multi-label targets)
from sklearn.model_selection import train_test_split
mlb = MultiLabelBinarizer()
documents = reuters.fileids()
test = [d for d in documents if d.startswith('test/')]
train = [d for d in documents if d.startswith('training/')]
X = [reuters.raw(doc_id) for doc_id in train]
y = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train])
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.3, random_state=12547392)
X_test = [reuters.raw(doc_id) for doc_id in test]
y_test = mlb.transform([reuters.categories(doc_id) for doc_id in test])
print(len(X_train))
print(len(X_dev))
print(len(X_test))
5438
2331
3019
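As a sanity check (not in the original notebook), the fitted binarizer exposes one column per Reuters topic, in alphabetical order:

# Each column of y corresponds to one Reuters topic
print(len(mlb.classes_))   # 90
print(mlb.classes_[:5])    # first few topic names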
print(y_train[0])
print(X_train[0])
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
BARUCH-FOSTER CORP <BFO> 4TH QTR LOSS Shr loss 92 cts vs profit one ct Net loss 2,487,439 vs profit 48,709 Revs 1,788,141 vs 4,167,070 Year Shr loss 1.50 dlrs vs profit 48 cts Net loss 4,073,724 vs profit 1,309,412 Revs 8,193,455 vs 15.7 mln NOTE: 1986 year net includes 3,095,305 dlr writedown of oil properties and reserves.
# Convert the texts to sequences of token indices and pad them to a fixed length
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_WORDS = 100000
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 300
tokenizer = Tokenizer(num_words=MAX_WORDS,oov_token='__UNK__')
tokenizer.fit_on_texts(X_train)
train_seqs = tokenizer.texts_to_sequences(X_train)
dev_seqs = tokenizer.texts_to_sequences(X_dev)
test_seqs = tokenizer.texts_to_sequences(X_test)
train_data = pad_sequences(train_seqs, maxlen=MAX_SEQUENCE_LENGTH,padding='post')
dev_data = pad_sequences(dev_seqs, maxlen=MAX_SEQUENCE_LENGTH,padding='post')
test_data = pad_sequences(test_seqs, maxlen=MAX_SEQUENCE_LENGTH,padding='post')
print(train_data[0])
[14878 2663 57 19 14879 168 84 36 51 36 610 20 11 59 65 878 28 36 32 4426 4689 11 59 728 5349 87 16 3583 2596 11 46 3285 4427 21 51 36 16 127 12 11 59 728 20 28 36 46 5761 5762 11 59 16 2764 5011 87 69 2597 3976 11 104 58 9 133 41 21 28 238 38 6868 3422 136 1548 4 64 870 7 235 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
Found 23387 unique tokens.
list(word_index.items())[:10]
[('__UNK__', 1), ('the', 2), ('to', 3), ('of', 4), ('in', 5), ('said', 6), ('and', 7), ('a', 8), ('mln', 9), ('for', 10)]
import numpy as np
# Build the model's embedding matrix from the pre-trained word2vec vectors
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((MAX_WORDS, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_WORDS:
        continue
    try:
        # use the pre-trained vector if the word is in the word2vec vocabulary
        embedding_matrix[i] = word2vec[word]
    except KeyError:
        # words without a pre-trained vector keep an all-zeros row
        pass
# Number of vocabulary words that received a (non-zero) pre-trained vector
print(len(np.unique(np.nonzero(embedding_matrix)[0])))
# Free RAM: the full word2vec model is no longer needed
del word2vec
14811
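The same count can be turned into a coverage ratio (an illustrative check, not part of the original notebook):

# Fraction of the tokenizer vocabulary covered by pre-trained word2vec vectors
covered = len(np.unique(np.nonzero(embedding_matrix)[0]))
print(f'{covered}/{len(word_index)} = {covered / len(word_index):.1%} of the vocabulary is covered')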
CNNs can be applied over word sequences in several modes. One is the causal (padded, dilated) mode, which produces a representation per token by applying dilated convolutions; a minimal sketch follows the link below:
https://theblog.github.io/post/convolution-in-autoregressive-neural-networks/
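A minimal sketch of this causal, dilated mode (an illustration, not used in the models trained below): `padding='causal'` lets each position see only earlier tokens, and the dilation widens the receptive field without shortening the sequence.

import tensorflow as tf

x = tf.random.normal((1, 250, 300))  # (batch, tokens, embedding dim)
causal = tf.keras.layers.Conv1D(128, 3, padding='causal', dilation_rate=2)(x)
print(causal.shape)  # (1, 250, 128) — one representation per token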
CNNs, like RNNs, are usually followed by a pooling mechanism that produces a fixed-size representation of the whole sequence (see the shape check after the links below):
Embedding Layer: https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding
Conv1D Layer: https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv1D
GlobalMaxPooling1D Layer: https://www.tensorflow.org/api_docs/python/tf/keras/layers/GlobalMaxPool1D
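A minimal shape check (illustrative, with dummy data): a Conv1D over the embedded sequence keeps the time axis, and GlobalMaxPooling1D collapses it into a fixed-size vector, one value per filter.

import tensorflow as tf

x = tf.random.normal((1, 250, 300))                     # (batch, tokens, embedding dim)
h = tf.keras.layers.Conv1D(128, 3, padding='same')(x)   # (1, 250, 128)
v = tf.keras.layers.GlobalMaxPooling1D()(h)             # (1, 128)
print(h.shape, v.shape)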
# Create and train a CNN model with trigram filters
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalMaxPooling1D, Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

FILTERS = 128  # the dimensionality of the output space (i.e. the number of output filters in the convolution)
KERNEL = 3     # the length of the 1D convolution window
DENSE = 64
N_CLASSES = 90

with tf.device('/device:GPU:0'):
    # create an empty sequential model
    model = Sequential()
    # add an embedding layer initialised with the pre-trained word2vec vectors (frozen)
    model.add(Embedding(input_dim=MAX_WORDS,  # size of the vocabulary, i.e. maximum integer index + 1
                        output_dim=EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    # dropout with probability 0.2
    model.add(Dropout(0.2))
    # add a stack of convolution layers
    model.add(Conv1D(filters=FILTERS, kernel_size=KERNEL, activation='relu', padding='valid'))
    model.add(Conv1D(filters=FILTERS, kernel_size=KERNEL, activation='relu', padding='valid'))
    model.add(Conv1D(filters=FILTERS, kernel_size=KERNEL, activation='relu', padding='valid'))
    model.add(Conv1D(filters=FILTERS, kernel_size=KERNEL, activation='relu', padding='valid'))
    model.add(Conv1D(filters=FILTERS, kernel_size=KERNEL, activation='relu', padding='valid'))
    # max pooling over time to obtain a fixed-size representation
    model.add(GlobalMaxPooling1D())
    # dropout with probability 0.2
    model.add(Dropout(0.2))
    # add a dense hidden layer
    model.add(Dense(DENSE, activation='relu'))
    # output layer: one sigmoid per class (multi-label classification)
    model.add(Dense(N_CLASSES, activation='sigmoid'))
    print(model.summary())
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(learning_rate=0.001),
                  metrics=[precision, recall, f1, accuracy])
    checkpoint = ModelCheckpoint('/content/checkpoints/keras_CNN_model', monitor='val_f1', verbose=1, save_best_only=True, mode='max')
    history = model.fit(train_data, y_train,
                        batch_size=32,
                        epochs=15,
                        verbose=0,
                        callbacks=[checkpoint],
                        validation_data=(dev_data, y_dev),
                        shuffle=True)
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 250, 300) 30000000 dropout (Dropout) (None, 250, 300) 0 conv1d (Conv1D) (None, 248, 128) 115328 conv1d_1 (Conv1D) (None, 246, 128) 49280 conv1d_2 (Conv1D) (None, 244, 128) 49280 conv1d_3 (Conv1D) (None, 242, 128) 49280 conv1d_4 (Conv1D) (None, 240, 128) 49280 global_max_pooling1d (Glob (None, 128) 0 alMaxPooling1D) dropout_1 (Dropout) (None, 128) 0 dense (Dense) (None, 64) 8256 dense_1 (Dense) (None, 90) 5850 ================================================================= Total params: 30326554 (115.69 MB) Trainable params: 326554 (1.25 MB) Non-trainable params: 30000000 (114.44 MB) _________________________________________________________________ None Epoch 1: val_f1 improved from -inf to 0.48786, saving model to /content/checkpoints/keras_CNN_model Epoch 2: val_f1 improved from 0.48786 to 0.57261, saving model to /content/checkpoints/keras_CNN_model Epoch 3: val_f1 improved from 0.57261 to 0.57663, saving model to /content/checkpoints/keras_CNN_model Epoch 4: val_f1 improved from 0.57663 to 0.58660, saving model to /content/checkpoints/keras_CNN_model Epoch 5: val_f1 improved from 0.58660 to 0.62014, saving model to /content/checkpoints/keras_CNN_model Epoch 6: val_f1 did not improve from 0.62014 Epoch 7: val_f1 improved from 0.62014 to 0.62749, saving model to /content/checkpoints/keras_CNN_model Epoch 8: val_f1 improved from 0.62749 to 0.64389, saving model to /content/checkpoints/keras_CNN_model Epoch 9: val_f1 improved from 0.64389 to 0.69694, saving model to /content/checkpoints/keras_CNN_model Epoch 10: val_f1 improved from 0.69694 to 0.73469, saving model to /content/checkpoints/keras_CNN_model Epoch 11: val_f1 did not improve from 0.73469 Epoch 12: val_f1 improved from 0.73469 to 0.75680, saving model to /content/checkpoints/keras_CNN_model Epoch 13: val_f1 improved from 0.75680 to 0.76387, saving model to /content/checkpoints/keras_CNN_model Epoch 14: val_f1 improved from 0.76387 to 0.77538, saving model to /content/checkpoints/keras_CNN_model Epoch 15: val_f1 did not improve from 0.77538
%matplotlib inline
import matplotlib.pyplot as plt
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper left')
plt.show()
# summarize history for f1
plt.plot(history.history['f1'])
plt.plot(history.history['val_f1'])
plt.title('model f1')
plt.ylabel('f1')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper right')
plt.show()
from sklearn.metrics import accuracy_score, f1_score
def predict_data(data):
    """Turn the sigmoid outputs into 0/1 label assignments."""
    probs = model.predict(data)
    # Rescale each row by its maximum probability...
    scaled = probs / np.max(probs, axis=1, keepdims=True)
    # ...and keep every label scoring at least half of the row's top score.
    return (scaled >= 0.5).astype(int)
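To see what the rescaling does, here is a toy illustration with made-up probabilities (hypothetical values, not model output):

# Every label whose score is at least half of the row's top score is selected.
probs = np.array([[0.60, 0.35, 0.10]])
print((probs / probs.max(axis=1, keepdims=True) >= 0.5).astype(int))  # [[1 1 0]]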
print(f'Dev accuracy: {accuracy_score(y_dev, predict_data(dev_data))*100:.2f}%')
print(f'Dev micro-f1: {f1_score(y_dev, predict_data(dev_data), average="micro")*100:.2f}%')
print(f'Test accuracy: {accuracy_score(y_test, predict_data(test_data))*100:.2f}%')
print(f'Test micro-f1: {f1_score(y_test, predict_data(test_data), average="micro")*100:.2f}%')
73/73 [==============================] - 1s 5ms/step
Dev accuracy: 71.34%
73/73 [==============================] - 0s 4ms/step
Dev micro-f1: 71.26%
95/95 [==============================] - 0s 4ms/step
Test accuracy: 72.21%
95/95 [==============================] - 0s 3ms/step
Test micro-f1: 70.91%
from IPython.display import Image
Image('Multi_filter_CNN.png')
# Create and train a CNN model with (2,3,4)-gram filters using the Keras functional API
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Input, Dropout, Embedding, GlobalMaxPooling1D, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Model
import tensorflow.keras.backend as K

FILTERS = 128  # the dimensionality of the output space (i.e. the number of output filters in the convolution)
DENSE = 256
N_CLASSES = 90

with tf.device('/device:GPU:0'):
    inputs = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    # Embeddings initialised with the pre-trained word2vec vectors (frozen)
    embeddings = Embedding(input_dim=MAX_WORDS,  # size of the vocabulary, i.e. maximum integer index + 1
                           output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                           input_length=MAX_SEQUENCE_LENGTH, trainable=False)(inputs)
    # Dropout over the embeddings
    dropped_embeddings = Dropout(rate=0.2)(embeddings)
    # Multi-filter CNNs
    pooled_convs = []
    filter_sizes = [2, 3, 4]
    for n_gram in filter_sizes:
        # n-gram convolutions with padding
        convs = Conv1D(filters=FILTERS, kernel_size=n_gram, strides=1,
                       padding="same", activation='relu',
                       name='{}-gram_Convolutions'.format(n_gram))(dropped_embeddings)
        # Max-pooling over time
        pooled_convs.append(GlobalMaxPooling1D(name='{}-gram_MaxPool'.format(n_gram))(convs))
    # Concatenate the pooled features from all window sizes
    concat = concatenate(pooled_convs)
    concat = Dropout(rate=0.5)(concat)
    outputs = Dense(N_CLASSES, activation='sigmoid')(concat)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(learning_rate=0.001),
                  metrics=[precision, recall, f1, accuracy])
    print(model.summary())
    checkpoint = ModelCheckpoint('/content/checkpoints/keras_Deep_CNN_model', monitor='val_f1', verbose=1, save_best_only=True, mode='max')
    history2 = model.fit(train_data,
                         y_train,
                         batch_size=32,
                         epochs=15,
                         verbose=0,
                         callbacks=[checkpoint],
                         validation_data=(dev_data, y_dev),
                         shuffle=True)
Model: "model" __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input_1 (InputLayer) [(None, 250)] 0 [] embedding_1 (Embedding) (None, 250, 300) 3000000 ['input_1[0][0]'] 0 dropout_2 (Dropout) (None, 250, 300) 0 ['embedding_1[0][0]'] 2-gram_Convolutions (Conv1 (None, 250, 128) 76928 ['dropout_2[0][0]'] D) 3-gram_Convolutions (Conv1 (None, 250, 128) 115328 ['dropout_2[0][0]'] D) 4-gram_Convolutions (Conv1 (None, 250, 128) 153728 ['dropout_2[0][0]'] D) 2-gram_MaxPool (GlobalMaxP (None, 128) 0 ['2-gram_Convolutions[0][0]'] ooling1D) 3-gram_MaxPool (GlobalMaxP (None, 128) 0 ['3-gram_Convolutions[0][0]'] ooling1D) 4-gram_MaxPool (GlobalMaxP (None, 128) 0 ['4-gram_Convolutions[0][0]'] ooling1D) concatenate (Concatenate) (None, 384) 0 ['2-gram_MaxPool[0][0]', '3-gram_MaxPool[0][0]', '4-gram_MaxPool[0][0]'] dropout_3 (Dropout) (None, 384) 0 ['concatenate[0][0]'] dense_2 (Dense) (None, 90) 34650 ['dropout_3[0][0]'] ================================================================================================== Total params: 30380634 (115.89 MB) Trainable params: 380634 (1.45 MB) Non-trainable params: 30000000 (114.44 MB) __________________________________________________________________________________________________ None Epoch 1: val_f1 improved from -inf to 0.57886, saving model to /content/checkpoints/keras_Deep_CNN_model Epoch 2: val_f1 improved from 0.57886 to 0.71221, saving model to /content/checkpoints/keras_Deep_CNN_model Epoch 3: val_f1 improved from 0.71221 to 0.75416, saving model to /content/checkpoints/keras_Deep_CNN_model Epoch 4: val_f1 improved from 0.75416 to 0.78435, saving model to /content/checkpoints/keras_Deep_CNN_model Epoch 5: val_f1 improved from 0.78435 to 0.80364, saving model to /content/checkpoints/keras_Deep_CNN_model Epoch 6: val_f1 improved from 0.80364 to 0.81720, saving model to /content/checkpoints/keras_Deep_CNN_model Epoch 7: val_f1 improved from 0.81720 to 0.82584, saving model to /content/checkpoints/keras_Deep_CNN_model Epoch 8: val_f1 improved from 0.82584 to 0.82811, saving model to /content/checkpoints/keras_Deep_CNN_model Epoch 9: val_f1 improved from 0.82811 to 0.83925, saving model to /content/checkpoints/keras_Deep_CNN_model Epoch 10: val_f1 improved from 0.83925 to 0.84980, saving model to /content/checkpoints/keras_Deep_CNN_model Epoch 11: val_f1 did not improve from 0.84980 Epoch 12: val_f1 improved from 0.84980 to 0.85831, saving model to /content/checkpoints/keras_Deep_CNN_model Epoch 13: val_f1 did not improve from 0.85831 Epoch 14: val_f1 improved from 0.85831 to 0.86194, saving model to /content/checkpoints/keras_Deep_CNN_model Epoch 15: val_f1 improved from 0.86194 to 0.86400, saving model to /content/checkpoints/keras_Deep_CNN_model
%matplotlib inline
import matplotlib.pyplot as plt
# summarize history for accuracy
plt.plot(history2.history['accuracy'])
plt.plot(history2.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history2.history['loss'])
plt.plot(history2.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper right')
plt.show()
from sklearn.metrics import accuracy_score, f1_score
def predict_data(data):
    """Turn the sigmoid outputs into 0/1 label assignments (same thresholding as before, now applied to the multi-filter model)."""
    probs = model.predict(data)
    # Rescale each row by its maximum probability and keep every label
    # scoring at least half of the row's top score.
    scaled = probs / np.max(probs, axis=1, keepdims=True)
    return (scaled >= 0.5).astype(int)
print(f'Dev accuracy: {accuracy_score(y_dev, predict_data(dev_data))*100:.2f}%')
print(f'Dev micro-f1: {f1_score(y_dev, predict_data(dev_data), average="micro")*100:.2f}%')
print(f'Test accuracy: {accuracy_score(y_test, predict_data(test_data))*100:.2f}%')
print(f'Test micro-f1: {f1_score(y_test, predict_data(test_data), average="micro")*100:.2f}%')
73/73 [==============================] - 0s 5ms/step
Dev accuracy: 80.14%
73/73 [==============================] - 0s 4ms/step
Dev micro-f1: 85.45%
95/95 [==============================] - 0s 4ms/step
Test accuracy: 79.86%
95/95 [==============================] - 0s 3ms/step
Test micro-f1: 83.88%
%matplotlib notebook
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model.png')
from IPython.display import Image
Image('model.png')