Experiment with embeddings to create bags of words
import os
import sys

# add the project root to sys.path so the local lightdlf_old package can be imported
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from lightdlf_old.cpu.core import Tensor
from lightdlf_old.cpu.layers import Embedding
import numpy as np
x = Embedding(3,5)
print(x)
<lightdlf.cpu.layers.Embedding object at 0x11330a3c8>
x.weight
array([[-0.09231089, -0.02688597,  0.00918141,  0.00381674, -0.03545999],
       [ 0.09436207, -0.03763539, -0.09253179, -0.04245997, -0.00750637],
       [-0.06457234,  0.00024291,  0.08645136,  0.0597631 , -0.02688918]])
print('indices [0,2]\n',x.weight.index_select(Tensor([[0,2],[1,2]])).data[0])
print('indices [1,2]\n',x.weight.index_select(Tensor([[0,2],[1,2]])).data[1])
indices [0,2]
 [[-0.09231089 -0.02688597  0.00918141  0.00381674 -0.03545999]
 [-0.06457234  0.00024291  0.08645136  0.0597631  -0.02688918]]
indices [1,2]
 [[ 0.09436207 -0.03763539 -0.09253179 -0.04245997 -0.00750637]
 [-0.06457234  0.00024291  0.08645136  0.0597631  -0.02688918]]
y = x.weight.index_select(Tensor([0,2])).sum(0)
print(y)
[-0.15688323 -0.02664306  0.09563277  0.06357984 -0.06234917]
y = x.weight.index_select(Tensor([[0,2],[1,2]])).sum(1)
print(y)
[[-0.15688323 -0.02664306  0.09563277  0.06357984 -0.06234917]
 [ 0.02978974 -0.03739248 -0.00608043  0.01730313 -0.03439555]]
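For reference, index_select followed by a sum is just NumPy fancy indexing plus a row-sum. A minimal sketch of the same computation on the raw array (assuming only that Tensor wraps a NumPy ndarray in .data, as the outputs above suggest):

w = x.weight.data                 # the (3, 5) embedding matrix as an ndarray
idx = np.array([[0, 2], [1, 2]])  # two "bags" of row indices
print(w[idx].sum(axis=1))         # one summed vector per bag, matching the output above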

Words used for a simple Bag of Words

Words with their indices (0-based, as used in the data tensor below):

  0. buen
  1. trabajo
  2. sabrosa
  3. comida
  4. fiesta
  5. divertida
  6. reunion
  7. aburrida
  8. trabajo
  9. dificil
  10. grave
  11. error

Sentences (the first three labeled positive, the last three negative):

  • buen trabajo ("good job")
  • sabrosa comida ("tasty food")
  • fiesta divertida ("fun party")
  • reunion aburrida ("boring meeting")
  • trabajo dificil ("difficult job")
  • grave error ("serious mistake")
# each row holds the two word indices of one sentence
data = Tensor([[0,1],
               [2,3],
               [4,5],
               [6,7],
               [8,9],
               [10,11]],
              autograd=True)

# 1 = positive sentence, 0 = negative
target = Tensor([[1],[1],[1],[0],[0],[0]], autograd=True)

embed = Embedding(12,5)
weight = Tensor(np.random.rand(5,1), autograd=True)
print(embed.weight)
[[ 0.01129587  0.05411751 -0.01789828  0.0478349  -0.0959201 ]
 [ 0.08772247  0.06548647 -0.08466379  0.06105468 -0.04552785]
 [ 0.03344448 -0.01799966  0.06945104  0.07558596 -0.07014028]
 [ 0.02046598 -0.03709017 -0.04655184 -0.06250056 -0.04449235]
 [-0.08805942  0.07226693 -0.01671041  0.09310067  0.01683078]
 [ 0.0757382  -0.0545573   0.09674004  0.08712794 -0.05208956]
 [ 0.00836074 -0.088845   -0.01288659 -0.03770544 -0.02679251]
 [ 0.05411948  0.09297315  0.08984485  0.07841951  0.03230447]
 [ 0.05794195 -0.03361713 -0.02556826 -0.02620728 -0.07834703]
 [ 0.0523658   0.03139289  0.01005846 -0.06450427  0.06012531]
 [-0.09164736 -0.05735323 -0.05880776  0.08320324  0.05173261]
 [-0.03472729  0.07937562 -0.09114472  0.03899011  0.09192391]]
word_set = embed.weight.index_select(data)
print(word_set)
[[[ 0.01129587  0.05411751 -0.01789828  0.0478349  -0.0959201 ]
  [ 0.08772247  0.06548647 -0.08466379  0.06105468 -0.04552785]]

 [[ 0.03344448 -0.01799966  0.06945104  0.07558596 -0.07014028]
  [ 0.02046598 -0.03709017 -0.04655184 -0.06250056 -0.04449235]]

 [[-0.08805942  0.07226693 -0.01671041  0.09310067  0.01683078]
  [ 0.0757382  -0.0545573   0.09674004  0.08712794 -0.05208956]]

 [[ 0.00836074 -0.088845   -0.01288659 -0.03770544 -0.02679251]
  [ 0.05411948  0.09297315  0.08984485  0.07841951  0.03230447]]

 [[ 0.05794195 -0.03361713 -0.02556826 -0.02620728 -0.07834703]
  [ 0.0523658   0.03139289  0.01005846 -0.06450427  0.06012531]]

 [[-0.09164736 -0.05735323 -0.05880776  0.08320324  0.05173261]
  [-0.03472729  0.07937562 -0.09114472  0.03899011  0.09192391]]]
bag = word_set.sum(1)
print(bag)
[[ 0.09901833  0.11960399 -0.10256207  0.10888958 -0.14144795]
 [ 0.05391046 -0.05508983  0.02289919  0.0130854  -0.11463264]
 [-0.01232122  0.01770963  0.08002963  0.18022861 -0.03525878]
 [ 0.06248022  0.00412815  0.07695827  0.04071407  0.00551196]
 [ 0.11030776 -0.00222424 -0.0155098  -0.09071155 -0.01822172]
 [-0.12637465  0.02202239 -0.14995248  0.12219334  0.14365652]]
pred = Tensor.mm(bag,weight)
print(pred)
print(target)
[[ 0.09221484]
 [ 0.02066811]
 [ 0.10238517]
 [ 0.098156  ]
 [ 0.03204621]
 [-0.07952762]]
[[1]
 [1]
 [1]
 [0]
 [0]
 [0]]
loss = ((pred - target) * (pred - target)).sum(0)
loss.backward(grad=None)

# one gradient-descent step: move the parameters against the gradient (learning rate 0.005)
weight.data = weight.data - 0.005 * weight.grad.data
weight.grad.data *= 0
embed.weight.data = embed.weight.data - 0.005 * embed.weight.grad.data
embed.weight.grad.data *= 0
print(loss)
[2.60586344]
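For intuition, the gradients that backward computes here can be written out by hand. A sketch of the chain rule for this squared-error loss (plain matrix calculus, not a specific lightdlf API):

# for L = sum((pred - target)^2) with pred = bag @ weight:
#   dL/dweight = 2 * bag.T @ (pred - target)
#   dL/dbag    = 2 * (pred - target) @ weight.T, scattered back onto the selected rows of embed.weight
manual_grad = 2 * bag.data.T.dot(pred.data - target.data)
print(manual_grad.shape)  # (5, 1), the same shape as weight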

Simple Bag of Words

np.random.seed(0)

data = Tensor([[0,1],
               [2,3],
               [4,5],
               [6,7],
               [8,9],
               [10,11]], 
              autograd=True)

target = Tensor([[1],[1],[1],[0],[0],[0]], autograd=True)

embed = Embedding(12,5)
weight = Tensor(np.random.rand(5,1), autograd=True)

for i in range(10):
    word_set = embed.weight.index_select(data)
    bag = word_set.sum(1)
    pred = Tensor.mm(bag,weight)
    
    loss = ((pred - target) * (pred - target)).sum(0)
    
    loss.backward(grad=None)
    
    # note: learning rate 0.5 for the output weights, 0.05 for the embeddings
    weight.data = weight.data - 0.5 * weight.grad.data
    weight.grad.data *= 0

    embed.weight.data = embed.weight.data - 0.05 * embed.weight.grad.data
    embed.weight.grad.data *= 0
    
    print(loss)
[2.91412065]
[1.33308181]
[0.17888107]
[0.01642682]
[0.00449219]
[0.00116295]
[0.00037631]
[0.00011018]
[3.4257201e-05]
[1.03142691e-05]
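The same experiment, repeated with the learning rate for weight lowered from 0.5 to 0.05 (matching the embeddings' learning rate); the loss now decreases more slowly: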
np.random.seed(0)

data = Tensor([[0,1],
               [2,3],
               [4,5],
               [6,7],
               [8,9],
               [10,11]], 
              autograd=True)

target = Tensor([[1],[1],[1],[0],[0],[0]], autograd=True)

embed = Embedding(12,5)
weight = Tensor(np.random.rand(5,1), autograd=True)

for i in range(10):
    word_set = embed.weight.index_select(data)
    bag = word_set.sum(1)
    pred = Tensor.mm(bag,weight)
    
    loss = ((pred - target) * (pred - target)).sum(0)
    
    loss.backward(grad=None)
    
    weight.data = weight.data - 0.05 * weight.grad.data
    weight.grad.data *= 0
    
    embed.weight.data = embed.weight.data - 0.05 * embed.weight.grad.data
    embed.weight.grad.data *= 0
    
    print(loss)
[2.91412065]
[1.56928068]
[0.78077574]
[0.32692768]
[0.11212161]
[0.03230925]
[0.008296]
[0.002024]
[0.00049514]
[0.00012654]
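With the smaller learning rate the loss still converges, but after ten iterations it is roughly an order of magnitude higher than in the previous run.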

IMDB Bag of Words

import sys

# load the raw IMDB reviews and their sentiment labels
with open('reviews.txt') as f:
    raw_reviews = f.readlines()

with open('labels.txt') as f:
    raw_labels = f.readlines()
print(raw_reviews[0])

# convert each review into a list of word tokens
tokens = list(map(lambda x: x.split(' '), raw_reviews))

# build the vocabulary and a word -> index mapping
vocab = set()
for oracion in tokens:
    for palabra in oracion:
        vocab.add(palabra)
vocab = list(vocab)

word2index = {}
for i, palabra in enumerate(vocab):
    word2index[palabra] = i

# build the input dataset: each review becomes the list of its distinct word indices
input_dataset = list()
for oracion in tokens:
    oracion_indices = set()
    for palabra in oracion:
        try:
            oracion_indices.add(word2index[palabra])
        except KeyError:
            pass
    input_dataset.append(list(oracion_indices))
print(input_dataset[0])

# build the target dataset: 1 for positive reviews, 0 for negative
target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append([1])
    else:
        target_dataset.append([0])
print(target_dataset[:10])
bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   

[0, 50181, 50184, 50710, 3099, 35875, 43044, 19511, 63549, 8254, 33862, 65607, 42062, 16462, 69713, 66646, 15463, 31337, 26733, 53368, 5760, 55427, 53382, 36491, 11919, 31887, 33937, 4241, 27283, 36508, 61088, 11937, 49319, 3754, 21682, 29362, 55989, 32442, 35002, 68800, 2244, 14022, 26311, 711, 38616, 70366, 19166, 28386, 18665, 16110, 19713, 36106, 58642, 53017, 13604, 35621, 4914, 73526, 15158, 18247, 37708, 18256, 58704, 38741, 30042, 58205, 56162, 69476, 34667, 47980, 7536, 46451, 13695, 41871, 7569, 43410, 14751, 20393, 53171, 38851, 68038, 57291, 71117, 52174, 59353, 22490, 36320, 34272, 3558, 23526, 64495, 38896, 60404, 37369]
[[1], [0], [1], [0], [1], [0], [1], [0], [1], [0]]
import numpy as np
np.random.seed(0)

# data = Tensor(input_dataset, autograd=True)  # left out: reviews have different lengths, so they cannot form one rectangular tensor
target = Tensor(target_dataset, autograd=True)
embed = Embedding(len(vocab), 100)
linear = Tensor((np.random.randn(100, 1) * np.sqrt(2.0 / (100))), autograd=True)

# for i in range(10):
#     for j in range(len(target.data)):
#         bag = embed.weight.index_select(input_dataset[j])
# two equivalent ways of building a bag vector for one review:
# with a batch dimension (sum over axis 1) and without one (sum over axis 0)
bag1 = embed.weight.index_select(Tensor([input_dataset[3]])).sum(1)
bag2 = embed.weight.index_select(Tensor(input_dataset[3])).sum(0)
pred1 = bag1.mm(linear)
pred2 = bag2.mm(linear)
print(pred1, pred2)
[[-0.07938035]] [-0.07938035]
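Both formulations yield the same prediction, so the batched form (with the extra dimension) is the one used in the training loop below.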
test = Tensor(target_dataset[0], autograd=True)
print(test)
[1]
from IPython.display import clear_output
import numpy as np
np.random.seed(0)

target = Tensor(target_dataset, autograd=True)
embed = Embedding(len(vocab), 100)
linear = Tensor((np.random.randn(100, 1) * np.sqrt(2.0 / (100))), autograd=True)

loss_progression = list()
for i in range(2):
    samples = len(target.data) - 15000  # hold out the last 15000 reviews
    acum_loss = 0
    for j in range(samples):
        # bag of words: sum the embeddings of the review's distinct words
        bag = embed.weight.index_select(Tensor([input_dataset[j]])).sum(1)
        pred = bag.mm(linear)

        target_j = Tensor([target_dataset[j]], autograd=True)
        loss = ((pred - target_j) * (pred - target_j))
        acum_loss = acum_loss + loss.data[0]

        # backpropagate and take one SGD step (learning rate 0.01)
        loss.backward(grad=None)
        linear.data = linear.data - (0.01 * linear.grad.data)
        linear.grad.data *= 0
        embed.weight.data = embed.weight.data - (0.01 * embed.weight.grad.data)
        embed.weight.grad.data *= 0

        # lightweight progress indicator
        if j % (samples // 1000) == 0:
            clear_output()
            sys.stdout.write(str((j / samples) * 100))
    loss_progression.append(acum_loss / samples)
    print(loss_progression[i])
print(loss_progression)
99.9[0.09899533]
[array([0.14831021]), array([0.09899533])]

Note: training the model with bag of words this way is not as efficient as it would be if we did the backpropagation manually.

In well-known frameworks such as PyTorch, there are classes designed for exactly this kind of task, such as EmbeddingBag and embedding_bag.
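As a point of reference, a minimal sketch of the same sum-of-embeddings model using PyTorch's EmbeddingBag, with sizes chosen to mirror the toy example above (illustrative, not the code used in this notebook):

import torch
import torch.nn as nn

# mode='sum' reproduces the index_select(...).sum(1) pattern used above
embed = nn.EmbeddingBag(num_embeddings=12, embedding_dim=5, mode='sum')
linear = nn.Linear(5, 1)

# each row is one bag of word indices (one sentence)
data = torch.tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11]])
pred = linear(embed(data))  # shape (6, 1): one score per sentence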

Other References