NLP - Embedding¶

How to convert a text string into a tensor representation

import numpy as np
import pandas as pd
import keras

from keras.preprocessing.text import text_to_word_sequence

# Sample sentence reused by the tokenization, hashing and Tokenizer examples below.
sentence = "The quick brown fox jumped over the lazy dog"

Tokenization¶

Uni-gram at word or character level
N-grams at word or character level

Basic Cleaning

Filter for Punctuation
Case change

# Split the sentence into a list of word tokens. As the output below shows,
# the tokens come back lower-cased ("The" -> "the"); per the Keras docs the
# default filters also strip punctuation.
text_to_word_sequence(sentence)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']

Vectorisation or Embedding¶

from keras.preprocessing.text import one_hot, hashing_trick

# Give a vocabulary size n, get one integer index per word.
# Each word is hashed to an index below n, so with a small n distinct words
# can collide: in the output below "the" and "fox" share the same index.
one_hot(sentence, n = 10)

[7, 8, 3, 7, 3, 9, 7, 8, 4]

# Same idea with an explicit hash function and a larger index space (n = 100).
# Per the Keras docs, "md5" is stable across runs, unlike Python's built-in hash.
hashing_trick(sentence, n = 100, hash_function="md5")

[51, 13, 19, 11, 7, 95, 51, 74, 33]

Using the Tokenizer utility

from keras.preprocessing.text import Tokenizer

# Instantiate the Tokenizer with its default settings
simple_tokenizer = Tokenizer()

# Fit the Tokenizer: build the vocabulary from the given list of texts
simple_tokenizer.fit_on_texts([sentence])

# See the word index: a dict mapping each word to an integer index.
# Indices start at 1 (see the output below).
simple_tokenizer.word_index

{'the': 1,
 'quick': 2,
 'brown': 3,
 'fox': 4,
 'jumped': 5,
 'over': 6,
 'lazy': 7,
 'dog': 8}

We will work on a set of sentences

# A small corpus of three sentences that share several words ("the", "fox", "dog").
sentences = ['The quick brown fox jumped over the lazy dog',
            'The dog woke up lazily and barked at the fox',
            'The fox looked back and just ignored the dog']

# Fit a fresh Tokenizer on the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# In the resulting index the most frequent words get the lowest indices:
# "the" (6 occurrences) is 1, then "fox" and "dog" (3 each), then "and" (2).
tokenizer.word_index

{'the': 1,
 'fox': 2,
 'dog': 3,
 'and': 4,
 'quick': 5,
 'brown': 6,
 'jumped': 7,
 'over': 8,
 'lazy': 9,
 'woke': 10,
 'up': 11,
 'lazily': 12,
 'barked': 13,
 'at': 14,
 'looked': 15,
 'back': 16,
 'just': 17,
 'ignored': 18}

# One Hot Encoded array
# The row has 19 columns: one per word index (1-18) plus column 0, which no
# word in word_index uses. "the" has index 1, so only column 1 is set.
tokenizer.texts_to_matrix(["the"])

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]])

How do we go from one-hot encoded words to sentence vectors?¶

Options for building sparse sentence vectors from one-hot encoded words:

Binary
Count
Frequency
Tf-Idf

# binary: one row per sentence; 1.0 if the word occurs in the sentence, else 0.0.
tokenizer.texts_to_matrix(sentences, mode="binary")

array([[0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.],
       [0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0.,
        0., 0., 0.],
       [0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        1., 1., 1.]])

# count: number of occurrences of each word in the sentence
# ("the" appears twice in every sentence, hence 2.0 in column 1).
tokenizer.texts_to_matrix(sentences, mode="count")

array([[0., 2., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.],
       [0., 2., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0.,
        0., 0., 0.],
       [0., 2., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        1., 1., 1.]])

# freq: word count divided by the number of words in the sentence
# (e.g. "the" is 2/9 in the first sentence and 2/10 in the second).
tokenizer.texts_to_matrix(sentences, mode="freq")

array([[0.        , 0.22222222, 0.11111111, 0.11111111, 0.        ,
        0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.2       , 0.1       , 0.1       , 0.1       ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.1       , 0.1       , 0.1       , 0.1       , 0.1       ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.22222222, 0.11111111, 0.11111111, 0.11111111,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.11111111, 0.11111111, 0.11111111, 0.11111111]])

# tfidf: the values below are consistent with
#   (1 + ln(count)) * ln(1 + n_docs / (1 + doc_freq))
# e.g. "fox" (count 1, present in all 3 sentences): 1 * ln(1 + 3/4) ~= 0.5596.
# Words that appear in fewer sentences get a larger weight.
tokenizer.texts_to_matrix(sentences, mode="tfidf")

array([[0.        , 0.94751189, 0.55961579, 0.55961579, 0.        ,
        0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.91629073,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.94751189, 0.55961579, 0.55961579, 0.69314718,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.91629073, 0.91629073, 0.91629073, 0.91629073, 0.91629073,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.94751189, 0.55961579, 0.55961579, 0.69314718,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.91629073, 0.91629073, 0.91629073, 0.91629073]])

Dense Embedding¶

Pre-trained Embedding¶

Learn the pattern of words in a context and get a dense representation of the word vector

Word2Vec (learnt at the word level - skip-gram or CBOW)
FastText (learnt from character n-grams, i.e. at the subword level)
GloVe (learnt from a word co-occurrence matrix)

Context specific - learn on a corpus

import spacy

# One-time setup: download the model by running the command below
# (without the leading "#") in a notebook cell.
#!python -m spacy download en_core_web_lg

# Load the spaCy pipeline; the model must have been downloaded first.
nlp = spacy.load('en_core_web_lg')

# Run the pipeline on the first sentence of the corpus.
doc1 = nlp(sentences[0])

# Check, token by token, whether a word vector is available.
for token in doc1:
    print(token.text, token.has_vector)

The True
quick True
brown True
fox True
jumped True
over True
the True
lazy True
dog True

# Process single words so that each processed text contains just that word.
word1 = nlp("dog")
word2 = nlp("fox")
word3 = nlp("the")

# Similarity between "dog" and "fox"; per the spaCy docs the default estimate
# is the cosine similarity of the (averaged) word vectors.
word1.similarity(word2)

0.48585482527991497

# "dog" vs "the": in the outputs shown this scores lower than "dog" vs "fox".
word1.similarity(word3)

0.2935313775372794