How do we encode / pre-process data in text?
# Single example sentence reused by the tokenization examples that follow.
sentence = "The quick brown fox jumped over the lazy dog"
Pre-processing:
import numpy as np
import pandas as pd
# Both helpers live in the same Keras module, so import them together.
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

# Split the sample sentence into its word tokens.
text_to_word_sequence(sentence)

# Fit a tokenizer (configured with num_words=50) on just that one sentence,
# then print the word -> index mapping it built.
simple_tokenizer = Tokenizer(num_words=50)
simple_tokenizer.fit_on_texts([sentence])
print(simple_tokenizer.word_index)
# Small three-sentence corpus used for the encoding examples below.
sentences = [
    "The quick brown fox jumped over the lazy dog",
    "The dog woke up lazily and barked at the fox",
    "the fox looked back and just ignored the dog ",
]

# Build the vocabulary from the whole corpus and inspect the word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
tokenizer.word_index

# Sequence encoding of each sentence.
tokenizer.texts_to_sequences(sentences)

# Matrix encodings of the same sentences under different `mode` settings.
tokenizer.texts_to_matrix(sentences, mode="binary")
sent_count = tokenizer.texts_to_matrix(sentences, mode="count")
sent_freq = tokenizer.texts_to_matrix(sentences, mode="freq")

# Look at the first sentence's row in both matrices, then normalise the
# "count" row by its own total so it can be compared with the "freq" row.
sent_count[0]
sent_freq[0]
# ndarray.sum() reduces the row inside NumPy instead of iterating it
# element by element with the Python builtin sum().
sent_count[0] / sent_count[0].sum()

tokenizer.texts_to_matrix(sentences, mode="tfidf")
import spacy

# Download command for the model loaded below (kept commented out):
#!python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

# Process two single-word texts, check their vector shapes, and ask spaCy
# for the similarity between them.
doc1, doc2 = nlp("fox"), nlp("dog")
(doc1.vector.shape, doc2.vector.shape)
doc1.similarity(doc2)

# Word-vector arithmetic: king - man + woman, to be compared with queen.
king, queen, man, woman = (
    nlp(word) for word in ("king", "queen", "man", "woman")
)
pred_queen = king.vector - man.vector + woman.vector

# Cosine similarity between the predicted vector and the "queen" vector:
# dot product divided by each vector's Euclidean norm in turn.
(
    np.dot(pred_queen, queen.vector)
    / np.linalg.norm(pred_queen)
    / np.linalg.norm(queen.vector)
)