Toxic Classification

A corpus of manually labelled comments and classify each comment

In [4]:
import numpy as np
import pandas as pd
import keras
import matplotlib.pyplot as plt
%matplotlib inline
import vis

Get the Data

In [6]:
#!wget http://bit.do/deep_toxic_train -O data/train.zip
In [7]:
df = pd.read_csv('data/train.zip')
In [8]:
df.head()
Out[8]:
id comment_text toxic severe_toxic obscene threat insult identity_hate
0 0000997932d777bf Explanation\nWhy the edits made under my usern... 0 0 0 0 0 0
1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 0 0 0 0 0
2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 0 0 0 0 0
3 0001b41b1c6bb37e "\nMore\nI can't make any real suggestions on ... 0 0 0 0 0 0
4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 0 0 0 0 0

Step 1: Create the Input and Output

In [15]:
import keras
In [18]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
In [19]:
# Tokenizer
max_words = 2000
tokenizer = Tokenizer(num_words=max_words)
In [21]:
train_sentences = df["comment_text"]
train_sentences.head()
Out[21]:
0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object
In [22]:
tokenizer.fit_on_texts(list(train_sentences))
In [23]:
# Index Representation
tokenizer_train = tokenizer.texts_to_sequences(train_sentences)
In [28]:
list(train_sentences)[1]
Out[28]:
"D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)"
In [37]:
tokenizer_train[1], len(tokenizer_train[1])
Out[37]:
([52, 13, 555, 73, 21, 94, 38, 803, 992, 589, 182], 11)
In [36]:
#Select Padding
number_of_words = [len(comment) for comment in tokenizer_train]
plt.hist(number_of_words, bins=40);
In [43]:
maxlen = 200
X = pad_sequences(tokenizer_train, maxlen=maxlen, padding="post")
In [44]:
X.shape
Out[44]:
(159571, 200)
In [45]:
X[1]
Out[45]:
array([ 52,  13, 555,  73,  21,  94,  38, 803, 992, 589, 182,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0], dtype=int32)
In [49]:
labels = df.iloc[:,2].values
In [48]:
labels.shape
Out[48]:
(159571,)
In [50]:
# Baseline 
1- df.iloc[:,2].sum()/ df.iloc[:,2].count()
Out[50]:
0.9041555169799024
In [51]:
from keras.utils import to_categorical
y = to_categorical(labels)
In [52]:
y.shape
Out[52]:
(159571, 2)
In [53]:
from sklearn.model_selection import train_test_split
In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                   random_state=42)
In [55]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[55]:
((127656, 200), (31915, 200), (127656, 2), (31915, 2))
In [66]:
X_train[0]
Out[66]:
array([  56,   10,    8,    7,  382,  818, 1373,  186,    2,  850, 1633,
       1754, 1342,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0], dtype=int32)

Step 2: Create the Model Architecture

In [72]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Flatten, LSTM
In [73]:
model = Sequential()
model.add(Embedding(max_words, output_dim=128, input_length=maxlen))
model.add(LSTM(60))
model.add(Dropout(0.2))
model.add(Dense(2, activation="sigmoid"))
In [74]:
model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_7 (Embedding)      (None, 200, 128)          256000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 60)                45360     
_________________________________________________________________
dropout_4 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 122       
=================================================================
Total params: 301,482
Trainable params: 301,482
Non-trainable params: 0
_________________________________________________________________
In [75]:
### Step 3: Complile & Fit model
In [76]:
model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
In [77]:
%%time 
history = model.fit(X_train, y_train, batch_size=32, epochs=1, verbose=2)
Epoch 1/1
 - 1295s - loss: 0.2524 - acc: 0.9214
CPU times: user 40min 1s, sys: 2min 57s, total: 42min 59s
Wall time: 21min 36s