Experiment with embeddings to create bags of words
import os
import sys

# add the project root to sys.path so the local lightdlf_old package can be imported
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from lightdlf_old.cpu.core import Tensor
from lightdlf_old.cpu.layers import Embedding
import numpy as np
x = Embedding(3,5)
print(x)
<lightdlf.cpu.layers.Embedding object at 0x11330a3c8>
x.weight
array([[-0.09231089, -0.02688597,  0.00918141,  0.00381674, -0.03545999],
       [ 0.09436207, -0.03763539, -0.09253179, -0.04245997, -0.00750637],
       [-0.06457234,  0.00024291,  0.08645136,  0.0597631 , -0.02688918]])
print('indices [0,2]\n',x.weight.index_select(Tensor([[0,2],[1,2]])).data[0])
print('indices [1,2]\n',x.weight.index_select(Tensor([[0,2],[1,2]])).data[1])
indices [0,2]
 [[-0.09231089 -0.02688597  0.00918141  0.00381674 -0.03545999]
 [-0.06457234  0.00024291  0.08645136  0.0597631  -0.02688918]]
indices [1,2]
 [[ 0.09436207 -0.03763539 -0.09253179 -0.04245997 -0.00750637]
 [-0.06457234  0.00024291  0.08645136  0.0597631  -0.02688918]]
y = x.weight.index_select(Tensor([0,2])).sum(0)
print(y)
[-0.15688323 -0.02664306  0.09563277  0.06357984 -0.06234917]
y = x.weight.index_select(Tensor([[0,2],[1,2]])).sum(1)
print(y)
[[-0.15688323 -0.02664306  0.09563277  0.06357984 -0.06234917]
 [ 0.02978974 -0.03739248 -0.00608043  0.01730313 -0.03439555]]
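For reference, index_select followed by a sum is just NumPy fancy indexing plus a row-sum. A minimal sketch of the same computation on the raw array (assuming only that Tensor wraps a NumPy ndarray in .data, as the outputs above suggest):

w = x.weight.data                 # the (3, 5) embedding matrix as an ndarray
idx = np.array([[0, 2], [1, 2]])  # two "bags" of row indices
print(w[idx].sum(axis=1))         # one summed vector per bag, matching the output above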

Words used for a simple Bag of Words

Words with their indices (0-based, as used in the data tensor below):

  0. buen
  1. trabajo
  2. sabrosa
  3. comida
  4. fiesta
  5. divertida
  6. reunion
  7. aburrida
  8. trabajo
  9. dificil
  10. grave
  11. error

Sentences (the first three labeled positive, the last three negative):

  • buen trabajo ("good job")
  • sabrosa comida ("tasty food")
  • fiesta divertida ("fun party")
  • reunion aburrida ("boring meeting")
  • trabajo dificil ("difficult job")
  • grave error ("serious mistake")
# each row holds the two word indices of one sentence
data = Tensor([[0,1],
               [2,3],
               [4,5],
               [6,7],
               [8,9],
               [10,11]],
              autograd=True)

# 1 = positive sentence, 0 = negative
target = Tensor([[1],[1],[1],[0],[0],[0]], autograd=True)

embed = Embedding(12,5)
weight = Tensor(np.random.rand(5,1), autograd=True)
print(embed.weight)
[[ 0.01129587  0.05411751 -0.01789828  0.0478349  -0.0959201 ]
 [ 0.08772247  0.06548647 -0.08466379  0.06105468 -0.04552785]
 [ 0.03344448 -0.01799966  0.06945104  0.07558596 -0.07014028]
 [ 0.02046598 -0.03709017 -0.04655184 -0.06250056 -0.04449235]
 [-0.08805942  0.07226693 -0.01671041  0.09310067  0.01683078]
 [ 0.0757382  -0.0545573   0.09674004  0.08712794 -0.05208956]
 [ 0.00836074 -0.088845   -0.01288659 -0.03770544 -0.02679251]
 [ 0.05411948  0.09297315  0.08984485  0.07841951  0.03230447]
 [ 0.05794195 -0.03361713 -0.02556826 -0.02620728 -0.07834703]
 [ 0.0523658   0.03139289  0.01005846 -0.06450427  0.06012531]
 [-0.09164736 -0.05735323 -0.05880776  0.08320324  0.05173261]
 [-0.03472729  0.07937562 -0.09114472  0.03899011  0.09192391]]
word_set = embed.weight.index_select(data)
print(word_set)
[[[ 0.01129587  0.05411751 -0.01789828  0.0478349  -0.0959201 ]
  [ 0.08772247  0.06548647 -0.08466379  0.06105468 -0.04552785]]

 [[ 0.03344448 -0.01799966  0.06945104  0.07558596 -0.07014028]
  [ 0.02046598 -0.03709017 -0.04655184 -0.06250056 -0.04449235]]

 [[-0.08805942  0.07226693 -0.01671041  0.09310067  0.01683078]
  [ 0.0757382  -0.0545573   0.09674004  0.08712794 -0.05208956]]

 [[ 0.00836074 -0.088845   -0.01288659 -0.03770544 -0.02679251]
  [ 0.05411948  0.09297315  0.08984485  0.07841951  0.03230447]]

 [[ 0.05794195 -0.03361713 -0.02556826 -0.02620728 -0.07834703]
  [ 0.0523658   0.03139289  0.01005846 -0.06450427  0.06012531]]

 [[-0.09164736 -0.05735323 -0.05880776  0.08320324  0.05173261]
  [-0.03472729  0.07937562 -0.09114472  0.03899011  0.09192391]]]
bag = word_set.sum(1)
print(bag)
[[ 0.09901833  0.11960399 -0.10256207  0.10888958 -0.14144795]
 [ 0.05391046 -0.05508983  0.02289919  0.0130854  -0.11463264]
 [-0.01232122  0.01770963  0.08002963  0.18022861 -0.03525878]
 [ 0.06248022  0.00412815  0.07695827  0.04071407  0.00551196]
 [ 0.11030776 -0.00222424 -0.0155098  -0.09071155 -0.01822172]
 [-0.12637465  0.02202239 -0.14995248  0.12219334  0.14365652]]
pred = Tensor.mm(bag,weight)
print(pred)
print(target)
[[ 0.09221484]
 [ 0.02066811]
 [ 0.10238517]
 [ 0.098156  ]
 [ 0.03204621]
 [-0.07952762]]
[[1]
 [1]
 [1]
 [0]
 [0]
 [0]]
loss = ((pred - target) * (pred - target)).sum(0)
loss.backward(grad=None)

# one gradient-descent step: move the parameters against the gradient (learning rate 0.005)
weight.data = weight.data - 0.005 * weight.grad.data
weight.grad.data *= 0
embed.weight.data = embed.weight.data - 0.005 * embed.weight.grad.data
embed.weight.grad.data *= 0
print(loss)
[2.60586344]
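For intuition, the gradients that backward computes here can be written out by hand. A sketch of the chain rule for this squared-error loss (plain matrix calculus, not a specific lightdlf API):

# for L = sum((pred - target)^2) with pred = bag @ weight:
#   dL/dweight = 2 * bag.T @ (pred - target)
#   dL/dbag    = 2 * (pred - target) @ weight.T, scattered back onto the selected rows of embed.weight
manual_grad = 2 * bag.data.T.dot(pred.data - target.data)
print(manual_grad.shape)  # (5, 1), the same shape as weight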

Simple Bag of Words

np.random.seed(0)

data = Tensor([[0,1],
               [2,3],
               [4,5],
               [6,7],
               [8,9],
               [10,11]], 
              autograd=True)

target = Tensor([[1],[1],[1],[0],[0],[0]], autograd=True)

embed = Embedding(12,5)
weight = Tensor(np.random.rand(5,1), autograd=True)

for i in range(10):
    word_set = embed.weight.index_select(data)
    bag = word_set.sum(1)
    pred = Tensor.mm(bag,weight)
    
    loss = ((pred - target) * (pred - target)).sum(0)
    
    loss.backward(grad=None)
    
    # note: learning rate 0.5 for the output weights, 0.05 for the embeddings
    weight.data = weight.data - 0.5 * weight.grad.data
    weight.grad.data *= 0

    embed.weight.data = embed.weight.data - 0.05 * embed.weight.grad.data
    embed.weight.grad.data *= 0
    
    print(loss)
[2.91412065]
[1.33308181]
[0.17888107]
[0.01642682]
[0.00449219]
[0.00116295]
[0.00037631]
[0.00011018]
[3.4257201e-05]
[1.03142691e-05]
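The same experiment, repeated with the learning rate for weight lowered from 0.5 to 0.05 (matching the embeddings' learning rate); the loss now decreases more slowly: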
np.random.seed(0)

data = Tensor([[0,1],
               [2,3],
               [4,5],
               [6,7],
               [8,9],
               [10,11]], 
              autograd=True)

target = Tensor([[1],[1],[1],[0],[0],[0]], autograd=True)

embed = Embedding(12,5)
weight = Tensor(np.random.rand(5,1), autograd=True)

for i in range(10):
    word_set = embed.weight.index_select(data)
    bag = word_set.sum(1)
    pred = Tensor.mm(bag,weight)
    
    loss = ((pred - target) * (pred - target)).sum(0)
    
    loss.backward(grad=None)
    
    weight.data = weight.data - 0.05 * weight.grad.data
    weight.grad.data *= 0
    
    embed.weight.data = embed.weight.data - 0.05 * embed.weight.grad.data
    embed.weight.grad.data *= 0
    
    print(loss)
[2.91412065]
[1.56928068]
[0.78077574]
[0.32692768]
[0.11212161]
[0.03230925]
[0.008296]
[0.002024]
[0.00049514]
[0.00012654]
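With the smaller learning rate the loss still converges, but after ten iterations it is roughly an order of magnitude higher than in the previous run.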

IMDB Bag of Words

import sys

# load the raw IMDB reviews and their sentiment labels
with open('reviews.txt') as f:
    raw_reviews = f.readlines()

with open('labels.txt') as f:
    raw_labels = f.readlines()
print(raw_reviews[0])

# convert each review into a list of word tokens
tokens = list(map(lambda x: x.split(' '), raw_reviews))

# build the vocabulary and a word -> index mapping
vocab = set()
for oracion in tokens:
    for palabra in oracion:
        vocab.add(palabra)
vocab = list(vocab)

word2index = {}
for i, palabra in enumerate(vocab):
    word2index[palabra] = i

# build the input dataset: each review becomes the list of its distinct word indices
input_dataset = list()
for oracion in tokens:
    oracion_indices = set()
    for palabra in oracion:
        try:
            oracion_indices.add(word2index[palabra])
        except KeyError:
            pass
    input_dataset.append(list(oracion_indices))
print(input_dataset[0])

# build the target dataset: 1 for positive reviews, 0 for negative
target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append([1])
    else:
        target_dataset.append([0])
print(target_dataset[:10])
bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   

[0, 50181, 50184, 50710, 3099, 35875, 43044, 19511, 63549, 8254, 33862, 65607, 42062, 16462, 69713, 66646, 15463, 31337, 26733, 53368, 5760, 55427, 53382, 36491, 11919, 31887, 33937, 4241, 27283, 36508, 61088, 11937, 49319, 3754, 21682, 29362, 55989, 32442, 35002, 68800, 2244, 14022, 26311, 711, 38616, 70366, 19166, 28386, 18665, 16110, 19713, 36106, 58642, 53017, 13604, 35621, 4914, 73526, 15158, 18247, 37708, 18256, 58704, 38741, 30042, 58205, 56162, 69476, 34667, 47980, 7536, 46451, 13695, 41871, 7569, 43410, 14751, 20393, 53171, 38851, 68038, 57291, 71117, 52174, 59353, 22490, 36320, 34272, 3558, 23526, 64495, 38896, 60404, 37369]
[[1], [0], [1], [0], [1], [0], [1], [0], [1], [0]]
import numpy as np
np.random.seed(0)

# data = Tensor(input_dataset, autograd=True)  # left out: reviews have different lengths, so they cannot form one rectangular tensor
target = Tensor(target_dataset, autograd=True)
embed = Embedding(len(vocab), 100)
linear = Tensor((np.random.randn(100, 1) * np.sqrt(2.0 / (100))), autograd=True)

# for i in range(10):
#     for j in range(len(target.data)):
#         bag = embed.weight.index_select(input_dataset[j])
# two equivalent ways of building a bag vector for one review:
# with a batch dimension (sum over axis 1) and without one (sum over axis 0)
bag1 = embed.weight.index_select(Tensor([input_dataset[3]])).sum(1)
bag2 = embed.weight.index_select(Tensor(input_dataset[3])).sum(0)
pred1 = bag1.mm(linear)
pred2 = bag2.mm(linear)
print(pred1, pred2)
[[-0.07938035]] [-0.07938035]
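Both formulations yield the same prediction, so the batched form (with the extra dimension) is the one used in the training loop below.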
test = Tensor(target_dataset[0], autograd=True)
print(test)
[1]
from IPython.display import clear_output
import numpy as np
np.random.seed(0)

target = Tensor(target_dataset, autograd=True)
embed = Embedding(len(vocab), 100)
linear = Tensor((np.random.randn(100, 1) * np.sqrt(2.0 / (100))), autograd=True)

loss_progression = list()
for i in range(2):
    samples = len(target.data) - 15000  # hold out the last 15000 reviews
    acum_loss = 0
    for j in range(samples):
        # bag of words: sum the embeddings of the review's distinct words
        bag = embed.weight.index_select(Tensor([input_dataset[j]])).sum(1)
        pred = bag.mm(linear)

        target_j = Tensor([target_dataset[j]], autograd=True)
        loss = ((pred - target_j) * (pred - target_j))
        acum_loss = acum_loss + loss.data[0]

        # backpropagate and take one SGD step (learning rate 0.01)
        loss.backward(grad=None)
        linear.data = linear.data - (0.01 * linear.grad.data)
        linear.grad.data *= 0
        embed.weight.data = embed.weight.data - (0.01 * embed.weight.grad.data)
        embed.weight.grad.data *= 0

        # lightweight progress indicator
        if j % (samples // 1000) == 0:
            clear_output()
            sys.stdout.write(str((j / samples) * 100))
    loss_progression.append(acum_loss / samples)
    print(loss_progression[i])
print(loss_progression)
99.9[0.09899533]
[array([0.14831021]), array([0.09899533])]

Note: training the model with bag of words this way is not as efficient as it would be if we did the backpropagation manually.

In well-known frameworks such as PyTorch, there are classes designed for exactly this kind of task, such as EmbeddingBag and embedding_bag.
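As a point of reference, a minimal sketch of the same sum-of-embeddings model using PyTorch's EmbeddingBag, with sizes chosen to mirror the toy example above (illustrative, not the code used in this notebook):

import torch
import torch.nn as nn

# mode='sum' reproduces the index_select(...).sum(1) pattern used above
embed = nn.EmbeddingBag(num_embeddings=12, embedding_dim=5, mode='sum')
linear = nn.Linear(5, 1)

# each row is one bag of word indices (one sentence)
data = torch.tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11]])
pred = linear(embed(data))  # shape (6, 1): one score per sentence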

Other References