How to convert a text string to a tensor representation
import numpy as np
import pandas as pd
import keras
from keras.preprocessing.text import text_to_word_sequence
# Sample sentence reused by the single-text preprocessing examples.
sentence = 'The quick brown fox jumped over the lazy dog'
Basic Cleaning
# Split the sentence into a list of word tokens.
# NOTE(review): per the Keras docs the default arguments also lowercase the
# text and filter punctuation — confirm for the installed version.
text_to_word_sequence(sentence)
from keras.preprocessing.text import one_hot, hashing_trick
# Encode the sentence as integers by hashing each word into a vocabulary of
# size n.
# NOTE(review): with a small n, distinct words may land on the same index —
# confirm against the Keras docs for the installed version.
one_hot(sentence, n=10)
# Same encoding with a larger vocabulary size and an explicit hash function.
hashing_trick(sentence, n=100, hash_function="md5")
Using a Tokenizer utility
from keras.preprocessing.text import Tokenizer
# Instantiate the Tokenizer with its default settings
simple_tokenizer = Tokenizer()
# Fit the Tokenizer; fit_on_texts is given a list of texts, so the single
# sentence is wrapped in a one-element list
simple_tokenizer.fit_on_texts([sentence])
# Inspect the word index built while fitting
simple_tokenizer.word_index
We will work on a set of sentences
# Small three-sentence corpus sharing several words ("the", "fox", "dog").
sentences = [
    "The quick brown fox jumped over the lazy dog",
    "The dog woke up lazily and barked at the fox",
    "The fox looked back and just ignored the dog",
]

# Fit a fresh Tokenizer on the whole corpus and inspect its word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
tokenizer.word_index

# One-hot encoded array for the single-word text "the".
tokenizer.texts_to_matrix(["the"])
Options for building sparse one-hot-style encodings (binary, count, freq, tf-idf)
# Encode every sentence as one matrix row; `mode` selects how each entry is
# weighted. NOTE(review): the per-mode descriptions below follow the Keras
# Tokenizer docs — confirm for the installed version.
# "binary": 1 where the word occurs in the sentence, 0 otherwise.
tokenizer.texts_to_matrix(sentences, mode="binary")
# "count": number of occurrences of the word in the sentence.
tokenizer.texts_to_matrix(sentences, mode="count")
# "freq": occurrence count relative to the sentence length.
tokenizer.texts_to_matrix(sentences, mode="freq")
# "tfidf": TF-IDF weighting using statistics from the fitted corpus.
tokenizer.texts_to_matrix(sentences, mode="tfidf")
import spacy
# The large English model must be installed once before loading it:
#     python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

# Run the pipeline on the first sentence and report, for each token, whether
# the loaded model provides a word vector for it.
doc1 = nlp(sentences[0])
for token in doc1:
    print(token.text, token.has_vector)
# Build one single-word Doc each for "dog", "fox" and "the" (same order as the
# names on the left-hand side).
word1, word2, word3 = (nlp(text) for text in ("dog", "fox", "the"))

# Compare "dog" with another noun and then with a function word.
# NOTE(review): similarity is presumably computed from the model's word
# vectors — confirm against the spaCy docs.
word1.similarity(word2)
word1.similarity(word3)