Toxic Classification

Given a corpus of manually labelled comments, classify each comment as toxic or non-toxic.

import numpy as np
import pandas as pd
import keras
import matplotlib.pyplot as plt
%matplotlib inline
import vis

Get the Data

#!wget http://bit.do/deep_toxic_train -O data/train.zip

# Load the labelled comments from the downloaded zip archive into a DataFrame.
train_archive = 'data/train.zip'
df = pd.read_csv(train_archive)

# Peek at the first few rows.
df.head()

Step 1: Create the Input and Output

import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Raw comment strings, one per row of the data frame.
train_sentences = df.loc[:, "comment_text"]

# Vocabulary budget handed to the Keras tokenizer (its `num_words` limit);
# the same value is reused later as the embedding's input dimension.
max_words = 2000
tokenizer = Tokenizer(num_words=max_words)

# Preview the first few comments.
train_sentences.head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

# Materialise the comments as a plain Python list once and reuse it below,
# rather than rebuilding the full list each time one is needed.
train_texts = list(train_sentences)

# Learn the word -> integer-id vocabulary from the comments.
# NOTE(review): the vocabulary is fitted on every row, including the rows that
# later end up in the test split.
tokenizer.fit_on_texts(train_texts)

# Index Representation: each comment becomes a list of integer word ids.
tokenizer_train = tokenizer.texts_to_sequences(train_texts)

# Second comment as raw text, for comparison with its encoded form.
train_texts[1]

"D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)"

# Encoded form of the second comment, paired with its number of token ids.
second_comment_ids = tokenizer_train[1]
second_comment_ids, len(second_comment_ids)

([52, 13, 555, 73, 21, 94, 38, 803, 992, 589, 182], 11)

# Select padding length: plot how many token ids each encoded comment has so a
# common sequence length can be chosen from the distribution.
number_of_words = list(map(len, tokenizer_train))
plt.hist(number_of_words, bins=40);

# Bring every encoded comment to exactly `maxlen` ids: shorter ones get zeros
# appended at the end, longer ones are cut from the front.
maxlen = 200
X = pad_sequences(
    tokenizer_train,
    maxlen=maxlen,
    padding="post",
    truncating="pre",  # Keras default, spelled out for clarity
)

# (number of comments, maxlen)
X.shape

(159571, 200)

# Padded id vector of the second comment: its token ids first, zeros after.
X[1]

array([ 52,  13, 555,  73,  21,  94,  38, 803, 992, 589, 182,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0], dtype=int32)

# Target vector: the third column of the frame (position 2) as a NumPy array.
label_series = df.iloc[:, 2]
labels = label_series.values

# One label per comment.
labels.shape

(159571,)

# Baseline: accuracy obtained by always predicting the negative class, i.e.
# one minus the share of positive labels in the label column.
label_column = df.iloc[:, 2]
1 - label_column.sum() / label_column.count()

0.9041555169799024

from keras.utils import to_categorical
# One-hot encode the integer labels: one column per class.
y = to_categorical(labels)

# (number of comments, number of classes)
y.shape

(159571, 2)

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four resulting arrays, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((127656, 200), (31915, 200), (127656, 2), (31915, 2))

# First padded training sequence after the split.
X_train[0]

array([  56,   10,    8,    7,  382,  818, 1373,  186,    2,  850, 1633,
       1754, 1342,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0], dtype=int32)

Step 2: Create the Model Architecture

from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Flatten, LSTM

# Sequence classifier: embedding -> LSTM -> dropout -> 2-way class distribution.
model = Sequential()
# The inputs are zero-padded at the end up to `maxlen`. mask_zero=True marks
# id 0 as padding so the LSTM skips those steps instead of reading a long run
# of zeros after the real tokens. The Keras tokenizer reserves id 0, so no
# real word is masked and `max_words` is still a valid input dimension.
model.add(Embedding(max_words, output_dim=128, input_length=maxlen,
                    mask_zero=True))
model.add(LSTM(60))
model.add(Dropout(0.2))
# The targets are one-hot over two mutually exclusive classes, so the two
# outputs must form a probability distribution: softmax rather than two
# independent sigmoids. For two classes, binary cross-entropy on softmax
# outputs equals categorical cross-entropy.
model.add(Dense(2, activation="softmax"))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_7 (Embedding)      (None, 200, 128)          256000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 60)                45360     
_________________________________________________________________
dropout_4 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 122       
=================================================================
Total params: 301,482
Trainable params: 301,482
Non-trainable params: 0
_________________________________________________________________

### Step 3: Compile & Fit model

# Configure training: RMSprop optimiser, binary cross-entropy evaluated on the
# one-hot target columns, and accuracy reported during fitting.
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

%%time 
history = model.fit(X_train, y_train, batch_size=32, epochs=1, verbose=2)

Epoch 1/1
 - 1295s - loss: 0.2524 - acc: 0.9214
CPU times: user 40min 1s, sys: 2min 57s, total: 42min 59s
Wall time: 21min 36s

	id	comment_text
0	0000997932d777bf	Explanation\nWhy the edits made under my usern...
1	000103f0d9cfb60f	D'aww! He matches this background colour I'm s...
2	000113f07ec002fd	Hey man, I'm really not trying to edit war. It...
3	0001b41b1c6bb37e	"\nMore\nI can't make any real suggestions on ...
4	0001d958c54c6e35	You, sir, are my hero. Any chance you remember...