Experiment with embeddings to create bags of words
import os
import sys

# Make the parent directory importable so lightdlf_old can be found
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from lightdlf_old.cpu.core import Tensor
from lightdlf_old.cpu.layers import Embedding
import numpy as np
# A 3x5 embedding: 3 rows (one per index), 5 dimensions each
x = Embedding(3, 5)
print(x)
print(x.weight)

# index_select with a 2D index tensor returns one matrix of word vectors
# per row of indices
print('indices [0,2]\n', x.weight.index_select(Tensor([[0,2],[1,2]])).data[0])
print('indices [1,2]\n', x.weight.index_select(Tensor([[0,2],[1,2]])).data[1])

# Summing the selected rows collapses them into a single bag-of-words vector
y = x.weight.index_select(Tensor([0,2])).sum(0)
print(y)

# With a batch of index lists, summing over axis 1 gives one bag per example
y = x.weight.index_select(Tensor([[0,2],[1,2]])).sum(1)
print(y)
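To make these semantics concrete, here is a minimal NumPy sketch of the same operation, assuming x.weight wraps a plain (3, 5) array: selecting rows by index and summing them is all a bag of words does.

import numpy as np

np.random.seed(0)
weight = np.random.rand(3, 5)          # stand-in for x.weight.data

indices = np.array([[0, 2], [1, 2]])   # two "sentences" of two word indices
selected = weight[indices]             # shape (2, 2, 5): word vectors per sentence
bags = selected.sum(axis=1)            # shape (2, 5): one bag vector per sentence

# Each bag is just the element-wise sum of its word vectors
assert np.allclose(bags[0], weight[0] + weight[2])
assert np.allclose(bags[1], weight[1] + weight[2])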
Words used for a simple Bag of Words
Words with their indices (note that trabajo appears twice, at indices 1 and 8):
- 0: buen
- 1: trabajo
- 2: sabrosa
- 3: comida
- 4: fiesta
- 5: divertida
- 6: reunion
- 7: aburrida
- 8: trabajo
- 9: dificil
- 10: grave
- 11: error
Sentences:
- buen trabajo
- sabrosa comida
- fiesta divertida
- reunion aburrida
- trabajo dificil
- grave error
data = Tensor([[0, 1],
               [2, 3],
               [4, 5],
               [6, 7],
               [8, 9],
               [10, 11]], autograd=True)
target = Tensor([[1],[1],[1],[0],[0],[0]], autograd=True)

embed = Embedding(12, 5)
weight = Tensor(np.random.rand(5, 1), autograd=True)
print(embed.weight)

# Select the word vectors for every sentence at once
word_set = embed.weight.index_select(data)
print(word_set)

# Sum each sentence's word vectors into a single bag vector
bag = word_set.sum(1)
print(bag)

# Linear layer on top of the bag vectors
pred = Tensor.mm(bag, weight)
print(pred)
print(target)

# One manual step of gradient descent on the squared error
# (descent subtracts the gradient, as in the loops below)
loss = ((pred - target) * (pred - target)).sum(0)
loss.backward(grad=None)
weight.data = weight.data - 0.005 * weight.grad.data
weight.grad.data *= 0
embed.weight.data = embed.weight.data - 0.005 * embed.weight.grad.data
embed.weight.grad.data *= 0
print(loss)
Simple Bag of Words
np.random.seed(0)

data = Tensor([[0, 1],
               [2, 3],
               [4, 5],
               [6, 7],
               [8, 9],
               [10, 11]], autograd=True)
target = Tensor([[1],[1],[1],[0],[0],[0]], autograd=True)

embed = Embedding(12, 5)
weight = Tensor(np.random.rand(5, 1), autograd=True)

# Train for 10 epochs: learning rate 0.5 for the linear layer,
# 0.05 for the embeddings
for i in range(10):
    word_set = embed.weight.index_select(data)
    bag = word_set.sum(1)
    pred = Tensor.mm(bag, weight)
    loss = ((pred - target) * (pred - target)).sum(0)
    loss.backward(grad=None)
    weight.data = weight.data - 0.5 * weight.grad.data
    weight.grad.data *= 0
    embed.weight.data = embed.weight.data - 0.05 * embed.weight.grad.data
    embed.weight.grad.data *= 0
    print(loss)
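The same experiment again, now with the linear layer's learning rate lowered from 0.5 to 0.05 so both parameter groups update at the same rate: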
np.random.seed(0)

data = Tensor([[0, 1],
               [2, 3],
               [4, 5],
               [6, 7],
               [8, 9],
               [10, 11]], autograd=True)
target = Tensor([[1],[1],[1],[0],[0],[0]], autograd=True)

embed = Embedding(12, 5)
weight = Tensor(np.random.rand(5, 1), autograd=True)

for i in range(10):
    word_set = embed.weight.index_select(data)
    bag = word_set.sum(1)
    pred = Tensor.mm(bag, weight)
    loss = ((pred - target) * (pred - target)).sum(0)
    loss.backward(grad=None)
    weight.data = weight.data - 0.05 * weight.grad.data
    weight.grad.data *= 0
    embed.weight.data = embed.weight.data - 0.05 * embed.weight.grad.data
    embed.weight.grad.data *= 0
    print(loss)
IMDB Bag of Words
import sys

# Load the raw IMDB reviews and their labels
with open('reviews.txt') as f:
    raw_reviews = f.readlines()
with open('labels.txt') as f:
    raw_labels = f.readlines()

print(raw_reviews[0])
# Split each review into a list of tokens and build the vocabulary
tokens = list(map(lambda x: x.split(' '), raw_reviews))

vocab = set()
for oracion in tokens:
    for palabra in oracion:
        vocab.add(palabra)
vocab = list(vocab)

# Map each word to its index in the vocabulary
word2index = {}
for i, palabra in enumerate(vocab):
    word2index[palabra] = i
# Build the input dataset: each review becomes a list of unique word indices
input_dataset = list()
for oracion in tokens:
    oracion_indices = set()
    for palabra in oracion:
        try:
            oracion_indices.add(word2index[palabra])
        except KeyError:
            pass
    input_dataset.append(list(oracion_indices))
print(input_dataset[0])
# Build the target dataset: 1 for positive reviews, 0 for negative
target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append([1])
    else:
        target_dataset.append([0])
print(target_dataset[:10])
import numpy as np

np.random.seed(0)

# data = Tensor(input_dataset, autograd=True)  # not usable: reviews have different lengths
target = Tensor(target_dataset, autograd=True)
embed = Embedding(len(vocab), 100)
linear = Tensor(np.random.randn(100, 1) * np.sqrt(2.0 / 100), autograd=True)

# for i in range(10):
#     for j in range(len(target.data)):
#         bag = embed.weight.index_select(input_dataset[j])

# Two ways of building the bag vector for a single review:
bag1 = embed.weight.index_select(Tensor([input_dataset[3]])).sum(1)  # batched: shape (1, 100)
bag2 = embed.weight.index_select(Tensor(input_dataset[3])).sum(0)    # flat: shape (100,)
pred1 = bag1.mm(linear)
pred2 = bag2.mm(linear)
print(pred1, pred2)
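Both forms select the same word vectors and sum them; bag1 keeps a leading batch dimension of 1 while bag2 is a flat vector, so pred1 and pred2 hold the same value in different shapes. The training loop below uses the batched form.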
# Quick check that a single label converts to a Tensor as expected
test = Tensor(target_dataset[0], autograd=True)
print(test)
from IPython.display import clear_output
import numpy as np

np.random.seed(0)

target = Tensor(target_dataset, autograd=True)
embed = Embedding(len(vocab), 100)
linear = Tensor(np.random.randn(100, 1) * np.sqrt(2.0 / 100), autograd=True)

loss_progression = list()
for i in range(2):
    # Hold out the last 15000 reviews; train on the rest, one review at a time
    samples = len(target.data) - 15000
    acum_loss = 0
    for j in range(samples):
        bag = embed.weight.index_select(Tensor([input_dataset[j]])).sum(1)
        pred = bag.mm(linear)
        target_j = Tensor([target_dataset[j]], autograd=True)
        loss = ((pred - target_j) * (pred - target_j))
        acum_loss = acum_loss + loss.data[0]

        # Stochastic gradient descent with learning rate 0.01
        loss.backward(grad=None)
        linear.data = linear.data - (0.01 * linear.grad.data)
        linear.grad.data *= 0
        embed.weight.data = embed.weight.data - (0.01 * embed.weight.grad.data)
        embed.weight.grad.data *= 0

        # Report progress (as a percentage) every 0.1% of an epoch
        if j % (samples // 1000) == 0:
            clear_output()
            sys.stdout.write(str((j / samples) * 100))
    loss_progression.append(acum_loss / samples)
    print(loss_progression[i])
print(loss_progression)
Note: training the model this way, building the bag of words through the autograd framework, is not as efficient as doing the backpropagation manually.
In well-known frameworks such as PyTorch there are classes designed for exactly this kind of task, such as EmbeddingBag and embedding_bag, which fuse the index-select and sum into a single operation.
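As a point of comparison, here is a minimal PyTorch sketch of the same toy classifier using nn.EmbeddingBag with mode='sum' (assuming PyTorch is installed; the hyperparameters mirror the toy example above, not the original notebook's):

import torch
import torch.nn as nn

torch.manual_seed(0)

vocab_size, embed_dim = 12, 5
embed = nn.EmbeddingBag(vocab_size, embed_dim, mode='sum')  # fused index_select + sum
linear = nn.Linear(embed_dim, 1)

# The six two-word sentences from the toy example above
data = torch.tensor([[0, 1], [2, 3], [4, 5],
                     [6, 7], [8, 9], [10, 11]])
target = torch.tensor([[1.], [1.], [1.], [0.], [0.], [0.]])

optimizer = torch.optim.SGD(
    list(embed.parameters()) + list(linear.parameters()), lr=0.05)

for epoch in range(10):
    optimizer.zero_grad()
    bag = embed(data)              # (6, 5): one summed bag vector per sentence
    pred = linear(bag)             # (6, 1)
    loss = ((pred - target) ** 2).sum()
    loss.backward()
    optimizer.step()
    print(loss.item())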