
1. Computer Vision

1.1 aps

This class implements the APS layer from this paper: https://arxiv.org/abs/2011.14214

1.1.1 APS(norm=2)

Initializes the class with normalization option.

Parameters:

- norm (int | float | Literal['fro', 'nuc', 'inf', '-inf'] | None, default: 2): Normalization type or value, defaults to 2.

Source code in src/layers/cv/aps.py
def __init__(
    self,
    norm: int | float | Literal["fro", "nuc", "inf", "-inf"] | None = 2,
) -> None:
    """
    Initializes the class with normalization option.

    Args:
        norm: Normalization type or value, defaults to 2.
    """

    # Call the parent constructor
    super().__init__()

    # Define the layer parameters
    self._stride = 2
    self.norm = norm

1.1.1.1 forward(input_tensor, return_index=False)

Processes input tensor to extract dominant polyphase component.

Parameters:

- input_tensor (Tensor, required): Tensor with shape (B, C, H, W).
- return_index (bool, default: False): If True, returns index of dominant component.

Returns:

- Tensor | tuple[Tensor, Tensor]: Output tensor, optionally with index if return_index is True.

Source code in src/layers/cv/aps.py
def forward(
    self, input_tensor: torch.Tensor, return_index: bool = False
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    """
    Processes input tensor to extract dominant polyphase component.

    Args:
        input_tensor: Tensor with shape (B, C, H, W).
        return_index: If True, returns index of dominant component.

    Returns:
        Output tensor, optionally with index if return_index is True.
    """

    # The input is a tensor of shape (B, C, H, W).
    # The number of polyphase components equals the stride squared,
    # because we move along both the height and the width,
    # giving 4 components in total.
    poly_a = input_tensor[:, :, :: self._stride, :: self._stride]
    poly_b = input_tensor[:, :, :: self._stride, 1 :: self._stride]
    poly_c = input_tensor[:, :, 1 :: self._stride, :: self._stride]
    poly_d = input_tensor[:, :, 1 :: self._stride, 1 :: self._stride]

    # Combine the components into a single tensor (B, P, C, H, W)
    polyphase_combined = torch.stack((poly_a, poly_b, poly_c, poly_d), dim=1)

    # Extract the dimensions
    b, p, _, _, _ = polyphase_combined.size()

    # Flatten the channel, height and width dimensions of the tensor
    polyphase_combined_reshaped = torch.reshape(polyphase_combined, (b, p, -1))

    # Apply the norm over the last dimension
    polyphase_norms = torch.linalg.vector_norm(
        input=polyphase_combined_reshaped, ord=self.norm, dim=(-1)
    )

    # Select the polyphase component with the largest norm
    polyphase_max_norm = torch.argmax(polyphase_norms)

    # Gather the polyphase component with the largest norm
    output_tensor = polyphase_combined[:, polyphase_max_norm, ...]

    # As in the paper, there is an option to return the index
    if return_index:
        return output_tensor, polyphase_max_norm

    # Otherwise we only return the tensor
    return output_tensor
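
As a quick illustration of what the forward pass above computes, the following self-contained sketch (plain PyTorch, not using the class itself) splits an input into its four polyphase components and keeps the one with the largest norm; the tensor sizes and the L2 norm are choices made for the example.

import torch

# Input assumed to be (B, C, H, W) with even H and W; values are illustrative
x = torch.randn(1, 3, 8, 8)
stride = 2

# The four polyphase components, exactly as in forward() above
components = torch.stack(
    (
        x[:, :, ::stride, ::stride],
        x[:, :, ::stride, 1::stride],
        x[:, :, 1::stride, ::stride],
        x[:, :, 1::stride, 1::stride],
    ),
    dim=1,
)  # (B, 4, C, H//2, W//2)

# L2 norm of each flattened component and selection of the dominant one
norms = torch.linalg.vector_norm(components.reshape(1, 4, -1), ord=2, dim=-1)
index = torch.argmax(norms)
print(components[:, index, ...].shape)  # torch.Size([1, 3, 4, 4])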

1.2 lps

This class implements the LPS layer from this paper: https://arxiv.org/abs/2210.08001

1.2.1 LPS(channel_size, hidden_size)

Initializes the model with specified channel and hidden sizes.

Parameters:

- channel_size (int, required): Number of input channels for the Conv2D layer.
- hidden_size (int, required): Number of hidden units for the Conv2D layer.

Source code in src/layers/cv/lps.py
def __init__(self, channel_size: int, hidden_size: int) -> None:
    """
    Initializes the model with specified channel and hidden sizes.

    Args:
        channel_size: Number of input channels for the Conv2D layer.
        hidden_size: Number of hidden units for the Conv2D layer.
    """

    # Call the parent constructor
    super().__init__()

    # Define the layer parameters
    self._stride = 2

    # Define the single model applied to every component
    self.conv_model = nn.Sequential(
        nn.Conv2d(
            in_channels=channel_size,
            out_channels=hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
        ),
        nn.ReLU(),
        nn.Conv2d(
            in_channels=hidden_size,
            out_channels=hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
        ),
        nn.AdaptiveAvgPool2d(1),  # pool to (B, hidden_size, 1, 1) before flattening
        nn.Flatten(),
    )

1.2.1.1 forward(input_tensor, return_index=False)

Processes input to extract dominant polyphase component.

Parameters:

- input_tensor (Tensor, required): Tensor with shape (B, C, H, W).
- return_index (bool, default: False): If True, returns index of dominant component.

Returns:

- Tensor | tuple[Tensor, Tensor]: Tensor of dominant component, optionally with index.

Source code in src/layers/cv/lps.py
def forward(
    self, input_tensor: torch.Tensor, return_index: bool = False
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    """
    Processes input to extract dominant polyphase component.

    Args:
        input_tensor: Tensor with shape (B, C, H, W).
        return_index: If True, returns index of dominant component.

    Returns:
        Tensor of dominant component, optionally with index.
    """

    # The input is a tensor of shape (B, C, H, W).
    # The number of polyphase components equals the stride squared,
    # because we move along both the height and the width,
    # giving 4 components in total.
    poly_a = input_tensor[:, :, :: self._stride, :: self._stride]
    poly_b = input_tensor[:, :, :: self._stride, 1 :: self._stride]
    poly_c = input_tensor[:, :, 1 :: self._stride, :: self._stride]
    poly_d = input_tensor[:, :, 1 :: self._stride, 1 :: self._stride]

    # Combine the components into a single tensor (B, P, C, H, W)
    polyphase_combined = torch.stack((poly_a, poly_b, poly_c, poly_d), dim=1)

    # Run the convolution-based model on each component
    _logits = []
    for polyphase in range(polyphase_combined.size()[1]):
        _logits.append(self.conv_model(polyphase_combined[:, polyphase, ...]))
    logits = torch.squeeze(torch.stack(_logits))

    # Apply Gumbel-Softmax over the last dimension to obtain selection weights
    polyphase_norms = F.gumbel_softmax(logits, tau=1, hard=False)

    # Select the polyphase component with the largest weight
    polyphase_max_norm = torch.argmax(polyphase_norms)

    # Gather the selected polyphase component
    output_tensor = polyphase_combined[:, polyphase_max_norm, ...]

    # As in the paper, there is an option to return the index
    if return_index:
        return output_tensor, polyphase_max_norm

    # Otherwise we only return the tensor
    return output_tensor
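
The selection above is driven by a Gumbel-Softmax over per-component logits rather than a fixed norm. A minimal conceptual sketch of that selection step, using made-up logits instead of the convolutional scorer:

import torch
import torch.nn.functional as F

# Hypothetical logits: one score per polyphase component (4 components)
logits = torch.tensor([0.2, 1.5, -0.3, 0.7])

# Soft, differentiable selection weights; hard=True would return a one-hot vector
weights = F.gumbel_softmax(logits, tau=1, hard=False)
index = torch.argmax(weights)
print(weights, index)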

1.3 se

This class implements the SE layer from this paper: https://arxiv.org/abs/1709.01507

1.3.1 SqueezeExcitation(channel_size, ratio)

Implements Squeeze-and-Excitation (SE) block.

Parameters:

- channel_size (int, required): Number of channels in the input tensor.
- ratio (int, required): Reduction factor for the compression layer.

Source code in src/layers/cv/se.py
def __init__(self, channel_size: int, ratio: int) -> None:
    """
    Implements Squeeze-and-Excitation (SE) block.

    Args:
        channel_size: Number of channels in the input tensor.
        ratio: Reduction factor for the compression layer.
    """

    # Call the parent constructor
    super().__init__()

    # Build the block as a Sequential model
    self.se_block = nn.Sequential(
        nn.AdaptiveAvgPool2d((1, 1)),  # (B, C, 1, 1)
        nn.Flatten(),  # (B, C)
        nn.Linear(
            in_features=channel_size, out_features=channel_size // ratio
        ),  # (B, C//ratio)
        nn.ReLU(),  # (B, C//ratio)
        nn.Linear(
            in_features=channel_size // ratio, out_features=channel_size
        ),  # (B, C)
        nn.Sigmoid(),
    )

1.3.1.1 forward(input_tensor)

Applies attention mechanism to input tensor.

Parameters:

- input_tensor (Tensor, required): Input tensor with shape (B, C, H, W).

Returns:

- Tensor: Tensor with attention applied, same shape as input.

Source code in src/layers/cv/se.py
def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
    """
    Applies attention mechanism to input tensor.

    Args:
        input_tensor: Input tensor with shape (B, C, H, W).

    Returns:
        Tensor with attention applied, same shape as input.
    """

    # First get the size of the input tensor
    b, c, _, _ = input_tensor.size()

    # Apply the SE block to the input
    x = self.se_block(input_tensor)

    # Reshape the tensor so it broadcasts against the input
    x = x.view(b, c, 1, 1)

    # Apply the product as the attention mechanism
    return x * input_tensor
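
A usage sketch, assuming SqueezeExcitation is exported as a torch.nn.Module from src/layers/cv/se.py (the exact import path depends on how the package is laid out):

import torch
from src.layers.cv.se import SqueezeExcitation  # assumed import path

se = SqueezeExcitation(channel_size=64, ratio=16)
feature_map = torch.randn(8, 64, 32, 32)
out = se(feature_map)
print(out.shape)  # torch.Size([8, 64, 32, 32]), same shape as the input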

1.4 vit

1.4.1 EncoderBlock(d_model, d_ff, h, dropout_rate)

Initialize encoder block module.

Parameters:

- d_model (int, required): Number of features in input.
- d_ff (int, required): Hidden layer feature dimensions.
- h (int, required): Number of attention heads.
- dropout_rate (float, required): Dropout rate for layers.

Source code in src/layers/cv/vit.py
def __init__(self, d_model: int, d_ff: int, h: int, dropout_rate: float) -> None:
    """
    Initialize encoder block module.

    Args:
        d_model: Number of features in input.
        d_ff: Hidden layer feature dimensions.
        h: Number of attention heads.
        dropout_rate: Dropout rate for layers.
    """

    super().__init__()

    # Parameters
    self.d_model = d_model
    self.d_ff = d_ff
    self.h = h
    self.dropout_rate = dropout_rate

    # Layer definitions
    self.multi_head_attention_layer = MultiHeadAttention(
        d_model=self.d_model, h=self.h, dropout_rate=self.dropout_rate
    )
    self.residual_layer_1 = ResidualConnection(
        features=d_model, dropout_rate=self.dropout_rate
    )
    self.feed_forward_layer = FeedForward(
        d_model=self.d_model, d_ff=self.d_ff, dropout_rate=self.dropout_rate
    )
    self.residual_layer_2 = ResidualConnection(
        features=d_model, dropout_rate=self.dropout_rate
    )

1.4.1.1 forward(input_tensor, mask=None)

Process input tensor through encoder block.

Parameters:

- input_tensor (Tensor, required): Batch of input tensors.
- mask (Tensor | None, default: None): Mask tensor, optional.

Returns:

- Tensor: Output tensor after encoder block processing.

Source code in src/layers/cv/vit.py
def forward(
    self, input_tensor: torch.Tensor, mask: torch.Tensor | None = None
) -> torch.Tensor:
    """
    Process input tensor through encoder block.

    Args:
        input_tensor: Batch of input tensors.
        mask: Mask tensor, optional.

    Returns:
        Output tensor after encoder block processing.
    """

    # We use self-attention, so k, q and v come from the same input tensor
    input_tensor = self.residual_layer_1(
        input_tensor,
        lambda x: self.multi_head_attention_layer(k=x, q=x, v=x, mask=mask),
    )

    # Second residual connection with the feed-forward layer
    input_tensor = self.residual_layer_2(
        input_tensor, lambda x: self.feed_forward_layer(x)
    )

    return input_tensor
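
A minimal usage sketch for the block, assuming EncoderBlock is importable from src/layers/cv/vit.py and that d_model is divisible by h:

import torch
from src.layers.cv.vit import EncoderBlock  # assumed import path

block = EncoderBlock(d_model=64, d_ff=256, h=8, dropout_rate=0.1)
tokens = torch.randn(2, 17, 64)  # (batch, sequence_length, d_model)
out = block(tokens)
print(out.shape)  # torch.Size([2, 17, 64])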

1.4.2 FeedForward(d_model, d_ff, dropout_rate)

Initialize feed-forward neural network.

Parameters:

- d_model (int, required): Input and output feature dimensions.
- d_ff (int, required): Hidden layer feature dimensions.
- dropout_rate (float, required): Dropout rate applied on layers.

Source code in src/layers/cv/vit.py
def __init__(self, d_model: int, d_ff: int, dropout_rate: float) -> None:
    """
    Initialize feed-forward neural network.

    Args:
        d_model: Input and output feature dimensions.
        d_ff: Hidden layer feature dimensions.
        dropout_rate: Dropout rate applied on layers.
    """

    # Call the parent constructor
    super().__init__()

    # Define the layer parameters
    self.d_model = d_model
    self.d_ff = d_ff

    # Build the sequential model
    self.ffn = nn.Sequential(
        nn.Linear(in_features=self.d_model, out_features=self.d_ff),
        nn.GELU(),
        nn.Dropout(dropout_rate),
        nn.Linear(in_features=self.d_ff, out_features=self.d_model),
    )

1.4.2.1 forward(input_tensor)

Process input tensor through feed-forward layers.

Parameters:

- input_tensor (Tensor, required): Batch of input tensors.

Returns:

- Tensor: Output tensor after feed-forward processing.

Source code in src/layers/cv/vit.py
def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
    """
    Process input tensor through feed-forward layers.

    Args:
        input_tensor: Batch of input tensors.

    Returns:
        Output tensor after feed-forward processing.
    """

    # (B, sequence_length, d_model)
    return self.ffn(input_tensor)

1.4.3 LayerNormalization(features, eps=1e-06)

Initialize layer normalization module.

Parameters:

- features (int, required): Number of features in input.
- eps (float, default: 1e-06): Small value to avoid division by zero.

Source code in src/layers/cv/vit.py
def __init__(self, features: int, eps: float = 1e-6) -> None:
    """
    Initialize layer normalization module.

    Args:
        features: Number of features in input.
        eps: Small value to avoid division by zero.
    """

    # Call the parent constructor
    super().__init__()

    # Define the layer parameters
    self.features = features
    self.eps = eps

    # alpha is a learnable scale applied to the normalized values
    self.alpha = nn.Parameter(torch.ones(self.features))
    # bias is a learnable offset added to the result
    self.bias = nn.Parameter(torch.zeros(self.features))

1.4.3.1 forward(input_embedding)

Apply layer normalization to input embeddings.

Parameters:

- input_embedding (Tensor, required): Batch of input embeddings.

Returns:

- Tensor: Normalized embeddings.

Source code in src/layers/cv/vit.py
def forward(self, input_embedding: torch.Tensor) -> torch.Tensor:
    """
    Apply layer normalization to input embeddings.

    Args:
        input_embedding: Batch of input embeddings.

    Returns:
        Normalized embeddings.
    """

    # (B, sequence_length, d_model)
    mean = torch.mean(input=input_embedding, dim=-1, keepdim=True)
    var = torch.var(input=input_embedding, dim=-1, keepdim=True, unbiased=False)
    return (
        self.alpha * ((input_embedding - mean) / (torch.sqrt(var + self.eps)))
        + self.bias
    )
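
Since alpha starts as ones and bias as zeros, a freshly initialized instance should behave like torch.nn.LayerNorm, which also normalizes over the last dimension with the biased variance. A small sanity-check sketch, assuming the class is importable from src/layers/cv/vit.py:

import torch
from torch import nn
from src.layers.cv.vit import LayerNormalization  # assumed import path

x = torch.randn(2, 5, 16)
custom = LayerNormalization(features=16, eps=1e-6)
reference = nn.LayerNorm(normalized_shape=16, eps=1e-6)

# Both normalize over the last dimension with the biased variance,
# so a freshly initialized custom layer should match the reference closely
print(torch.allclose(custom(x), reference(x), atol=1e-5))  # True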

1.4.4 MultiHeadAttention(d_model, h, dropout_rate)

Initialize multi-head attention module.

Parameters:

- d_model (int, required): Number of features in input.
- h (int, required): Number of attention heads.
- dropout_rate (float, required): Dropout rate applied on scores.

Source code in src/layers/cv/vit.py
def __init__(self, d_model: int, h: int, dropout_rate: float) -> None:
    """
    Initialize multi-head attention module.

    Args:
        d_model: Number of features in input.
        h: Number of attention heads.
        dropout_rate: Dropout rate applied on scores.
    """

    # Call the parent constructor
    super().__init__()

    # The embedding size must be divisible by the number of heads
    # so that the split is possible, i.e. the remainder has to be 0
    if d_model % h != 0:
        raise ValueError("d_model ha de ser divisible entre h")

    self.d_model = d_model
    self.h = h
    self.dropout = nn.Dropout(dropout_rate)

    # Values established in the paper
    self.d_k = self.d_model // self.h
    self.d_v = self.d_model // self.h

    # Parameters
    self.W_K = nn.Linear(
        in_features=self.d_model, out_features=self.d_model, bias=False
    )
    self.W_Q = nn.Linear(
        in_features=self.d_model, out_features=self.d_model, bias=False
    )
    self.W_V = nn.Linear(
        in_features=self.d_model, out_features=self.d_model, bias=False
    )
    self.W_OUTPUT_CONCAT = nn.Linear(
        in_features=self.d_model, out_features=self.d_model, bias=False
    )

1.4.4.1 attention(k, q, v, mask=None, dropout=None) staticmethod

Compute attention scores and output.

Parameters:

- k (Tensor, required): Key tensor.
- q (Tensor, required): Query tensor.
- v (Tensor, required): Value tensor.
- mask (Tensor | None, default: None): Mask tensor, optional.
- dropout (Dropout | None, default: None): Dropout layer, optional.

Returns:

- tuple[Tensor, Tensor]: Tuple of attention output and scores.

Source code in src/layers/cv/vit.py
@staticmethod
def attention(
    k: torch.Tensor,
    q: torch.Tensor,
    v: torch.Tensor,
    mask: torch.Tensor | None = None,
    dropout: nn.Dropout | None = None,
):
    """
    Compute attention scores and output.

    Args:
        k: Key tensor.
        q: Query tensor.
        v: Value tensor.
        mask: Mask tensor, optional.
        dropout: Dropout layer, optional.

    Returns:
        Tuple of attention output and scores.
    """

    # First compute the matrix product with the transposed keys
    # q = (Batch, h, seq_len, d_k)
    # k.T = (Batch, h, d_k, seq_len)
    # matmul_q_k = (Batch, h, seq_len, seq_len)
    matmul_q_k = q @ k.transpose(-2, -1)

    # Then apply the scaling
    d_k = k.shape[-1]
    matmul_q_k_scaled = matmul_q_k / math.sqrt(d_k)

    # The masking is for the decoder: masked positions are filled with -inf
    if mask is not None:
        matmul_q_k_scaled.masked_fill_(mask == 0, -1e9)

    # Get the attention scores
    attention_scores = F.softmax(input=matmul_q_k_scaled, dim=-1)

    # Apply dropout
    if dropout is not None:
        attention_scores = dropout(attention_scores)

    # Multiply by the values
    # attention_scores = (Batch, h, seq_len, seq_len)
    # v = (Batch, h, seq_len, d_k)
    # Output = (Batch, h, seq_len, d_k)
    return (attention_scores @ v), attention_scores
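
Because attention is a staticmethod, it can be exercised on its own. A shape-oriented sketch (the sizes are arbitrary, and the import path is an assumption):

import torch
from src.layers.cv.vit import MultiHeadAttention  # assumed import path

# (Batch, h, seq_len, d_k) tensors, shapes chosen only for illustration
k = torch.randn(2, 4, 10, 16)
q = torch.randn(2, 4, 10, 16)
v = torch.randn(2, 4, 10, 16)

output, scores = MultiHeadAttention.attention(k=k, q=q, v=v)
print(output.shape)  # torch.Size([2, 4, 10, 16])
print(scores.shape)  # torch.Size([2, 4, 10, 10])
print(torch.allclose(scores.sum(dim=-1), torch.ones(2, 4, 10)))  # True, rows sum to 1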

1.4.4.2 forward(k, q, v, mask=None)

Process input tensors through multi-head attention.

Parameters:

- k (Tensor, required): Key tensor.
- q (Tensor, required): Query tensor.
- v (Tensor, required): Value tensor.
- mask (Tensor | None, default: None): Mask tensor, optional.

Returns:

- Tensor: Output tensor after attention processing.

Source code in src/layers/cv/vit.py
def forward(
    self,
    k: torch.Tensor,
    q: torch.Tensor,
    v: torch.Tensor,
    mask: torch.Tensor | None = None,
) -> torch.Tensor:
    """
    Process input tensors through multi-head attention.

    Args:
        k: Key tensor.
        q: Query tensor.
        v: Value tensor.
        mask: Mask tensor, optional.

    Returns:
        Output tensor after attention processing.
    """

    # k -> (Batch, seq_len, d_model); the same applies to q and v
    key_prima = self.W_K(k)
    query_prima = self.W_Q(q)
    value_prima = self.W_V(v)

    # Reshape and split the embeddings across the heads,
    # going from (Batch, seq_len, d_model) to (Batch, seq_len, h, d_k)
    # and then from (Batch, seq_len, h, d_k) to (Batch, h, seq_len, d_k)
    key_prima = key_prima.view(
        key_prima.shape[0], key_prima.shape[1], self.h, self.d_k
    ).transpose(1, 2)
    query_prima = query_prima.view(
        query_prima.shape[0], query_prima.shape[1], self.h, self.d_k
    ).transpose(1, 2)
    value_prima = value_prima.view(
        value_prima.shape[0], value_prima.shape[1], self.h, self.d_k
    ).transpose(1, 2)

    # Compute the attention output and the attention scores
    # attention = (Batch, h, seq_len, d_k)
    # attention_scores = (Batch, h, seq_len, seq_len)
    attention, attention_scores = MultiHeadAttention.attention(
        k=key_prima,
        q=query_prima,
        v=value_prima,
        mask=mask,
        dropout=self.dropout,
    )

    # We have to concatenate the information from all the heads
    # We want (Batch, seq_len, d_model)
    # self.d_k = self.d_model // self.h; d_model = d_k * h
    attention = attention.transpose(1, 2)  # (Batch, seq_len, h, d_k)
    b, seq_len, h, d_k = attention.size()
    # contiguous() ensures the memory layout allows the following view
    attention_concat = attention.contiguous().view(
        b, seq_len, h * d_k
    )  # (Batch, seq_len, h * d_k) = (Batch, seq_len, d_model)

    return self.W_OUTPUT_CONCAT(attention_concat)
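
A self-attention usage sketch for the full module, assuming MultiHeadAttention is importable from src/layers/cv/vit.py:

import torch
from src.layers.cv.vit import MultiHeadAttention  # assumed import path

mha = MultiHeadAttention(d_model=64, h=8, dropout_rate=0.0)
tokens = torch.randn(2, 10, 64)  # (Batch, seq_len, d_model)

# Self-attention: k, q and v are the same tensor
out = mha(k=tokens, q=tokens, v=tokens)
print(out.shape)  # torch.Size([2, 10, 64])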

1.4.5 PatchEmbedding(patch_size_height, patch_size_width, in_channels, d_model)

Initialize patch embedding module.

Parameters:

- patch_size_height (int, required): Height of each patch.
- patch_size_width (int, required): Width of each patch.
- in_channels (int, required): Number of input channels.
- d_model (int, required): Dimension of the model.

Source code in src/layers/cv/vit.py
def __init__(
    self,
    patch_size_height: int,
    patch_size_width: int,
    in_channels: int,
    d_model: int,
) -> None:
    """
    Initialize patch embedding module.

    Args:
        patch_size_height: Height of each patch.
        patch_size_width: Width of each patch.
        in_channels: Number of input channels.
        d_model: Dimension of the model.
    """

    # Call the parent constructor
    super().__init__()

    # Define the layer parameters
    self.patch_size_height = patch_size_height
    self.patch_size_width = patch_size_width
    self.in_channels = in_channels
    self.d_model = d_model

    # This is one of the differences with text transformers:
    # instead of an Embedding layer we use a linear layer, i.e. a
    # projection of the pixels
    self.embedding = nn.Linear(
        in_features=self.in_channels
        * self.patch_size_height
        * self.patch_size_width,
        out_features=self.d_model,
    )

1.4.5.1 forward(input_tensor)

Apply linear projection to input tensor.

Parameters:

- input_tensor (Tensor, required): Batch of image patches as a tensor.

Returns:

- Tensor: Tensor after linear projection of patches.

Source code in src/layers/cv/vit.py
def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
    """
    Apply linear projection to input tensor.

    Args:
        input_tensor: Batch of image patches as a tensor.

    Returns:
        Tensor after linear projection of patches.
    """

    return self.embedding(input_tensor)

1.4.6 Patches(patch_size_height, patch_size_width, img_height, img_width)

Initialize patch extraction module.

Parameters:

- patch_size_height (int, required): Height of each patch.
- patch_size_width (int, required): Width of each patch.
- img_height (int, required): Height of the input image.
- img_width (int, required): Width of the input image.

Raises:

- ValueError: If img_height not divisible by patch height.
- ValueError: If img_width not divisible by patch width.

Source code in src/layers/cv/vit.py
def __init__(
    self,
    patch_size_height: int,
    patch_size_width: int,
    img_height: int,
    img_width: int,
) -> None:
    """
    Initialize patch extraction module.

    Args:
        patch_size_height: Height of each patch.
        patch_size_width: Width of each patch.
        img_height: Height of the input image.
        img_width: Width of the input image.

    Raises:
        ValueError: If img_height not divisible by patch height.
        ValueError: If img_width not divisible by patch width.
    """

    super().__init__()

    if img_height % patch_size_height != 0:
        raise ValueError(
            "img_height tiene que se divisible entre el patch_size_height"
        )

    if img_width % patch_size_width != 0:
        raise ValueError(
            "img_width tiene que se divisible entre el patch_size_width"
        )

    self.patch_size_height = patch_size_height
    self.patch_size_width = patch_size_width
    self.unfold = nn.Unfold(
        kernel_size=(self.patch_size_height, self.patch_size_width),
        stride=(self.patch_size_height, self.patch_size_width),
    )

1.4.6.1 forward(input_tensor)

Extract patches from input tensor.

Parameters:

- input_tensor (Tensor, required): Batch of images as a tensor.

Returns:

- Tensor: Tensor with patches from input images.

Source code in src/layers/cv/vit.py
def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
    """
    Extract patches from input tensor.

    Args:
        input_tensor: Batch of images as a tensor.

    Returns:
        Tensor with patches from input images.
    """

    # unfold returns (B, C * patch_height * patch_width, num_patches)
    patches = self.unfold(input_tensor)
    # We need (B, num_patches, C * patch_size_height * patch_size_width)
    return patches.transpose(2, 1)
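
The relation between image size, patch size and the resulting sequence length can be checked directly. A short sketch, assuming Patches is importable from src/layers/cv/vit.py:

import torch
from src.layers.cv.vit import Patches  # assumed import path

patches = Patches(patch_size_height=4, patch_size_width=4, img_height=32, img_width=32)
images = torch.randn(2, 3, 32, 32)
out = patches(images)

# (32 // 4) * (32 // 4) = 64 patches, each flattened to 3 * 4 * 4 = 48 values
print(out.shape)  # torch.Size([2, 64, 48])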

1.4.7 PositionalEncoding(d_model, sequence_length, dropout_rate)

Initialize positional encoding module.

Parameters:

- d_model (int, required): Dimension of the model.
- sequence_length (int, required): Max length of input sequences.
- dropout_rate (float, required): Dropout rate applied on outputs.

Source code in src/layers/cv/vit.py
def __init__(self, d_model: int, sequence_length: int, dropout_rate: float) -> None:
    """
    Initialize positional encoding module.

    Args:
        d_model: Dimension of the model.
        sequence_length: Max length of input sequences.
        dropout_rate: Dropout rate applied on outputs.
    """

    # Call the parent constructor
    super().__init__()

    # Define the layer parameters
    self.d_model = d_model

    # When we receive a sequence of tokens we need to know
    # the maximum length of the sequence
    self.sequence_length = sequence_length
    self.dropout = nn.Dropout(dropout_rate)

    # Create the positional embedding matrix
    # (sequence_length, d_model)
    pe_matrix = torch.zeros(size=(self.sequence_length, self.d_model))

    # Create the position vector
    position = torch.arange(0, self.sequence_length, dtype=torch.float).unsqueeze(1)

    # Create the divisor vector
    div_term = torch.exp(
        torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
    )

    # Apply sin and cos
    pe_matrix[:, 0::2] = torch.sin(position * div_term)
    pe_matrix[:, 1::2] = torch.cos(position * div_term)

    # Reshape to (1, sequence_length, d_model) so it can be
    # processed in batches
    pe_matrix = pe_matrix.unsqueeze(0)

    # This matrix is not learned; it is fixed, but we have to save it with the model
    self.register_buffer(name="pe_matrix", tensor=pe_matrix)

1.4.7.1 forward(input_embedding)

Add positional encoding to input embeddings.

Parameters:

- input_embedding (Tensor, required): Batch of input embeddings.

Returns:

- Tensor: Embeddings with added positional encoding.

Source code in src/layers/cv/vit.py
def forward(self, input_embedding: torch.Tensor) -> torch.Tensor:
    """
    Add positional encoding to input embeddings.

    Args:
        input_embedding: Batch of input embeddings.

    Returns:
        Embeddings with added positional encoding.
    """

    # (B, ..., d_model) -> (B, sequence_length, d_model)
    # Keep only the positions that match the input length
    x = input_embedding + (
        self.pe_matrix[:, : input_embedding.shape[1], :]  # type: ignore
    ).requires_grad_(False)
    return self.dropout(x)
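
With zero inputs and no dropout, the output of this layer is the sin/cos matrix itself, which makes the encoding easy to inspect. A small sketch, assuming PositionalEncoding is importable from src/layers/cv/vit.py:

import torch
from src.layers.cv.vit import PositionalEncoding  # assumed import path

pe = PositionalEncoding(d_model=64, sequence_length=65, dropout_rate=0.0)
embeddings = torch.zeros(2, 65, 64)
out = pe(embeddings)

# At position 0 the even columns are sin(0) = 0 and the odd columns are cos(0) = 1
print(out[0, 0, :4])  # tensor([0., 1., 0., 1.])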

1.4.8 ResidualConnection(features, dropout_rate)

Initialize residual connection module.

Parameters:

- features (int, required): Number of features in input.
- dropout_rate (float, required): Dropout rate for sublayer output.

Source code in src/layers/cv/vit.py
def __init__(self, features: int, dropout_rate: float) -> None:
    """
    Initialize residual connection module.

    Args:
        features: Number of features in input.
        dropout_rate: Dropout rate for sublayer output.
    """

    super().__init__()

    self.dropout = nn.Dropout(dropout_rate)
    self.layer_norm = LayerNormalization(features=features)

1.4.8.1 forward(input_tensor, sublayer)

Apply residual connection to sublayer output.

Parameters:

- input_tensor (Tensor, required): Original input tensor.
- sublayer (Module, required): Sublayer module to apply.

Returns:

- Tensor: Tensor with residual connection applied.

Source code in src/layers/cv/vit.py
def forward(self, input_tensor: torch.Tensor, sublayer: nn.Module) -> torch.Tensor:
    """
    Apply residual connection to sublayer output.

    Args:
        input_tensor: Original input tensor.
        sublayer: Sublayer module to apply.

    Returns:
        Tensor with residual connection applied.
    """

    return input_tensor + self.dropout(sublayer(self.layer_norm(input_tensor)))

1.4.9 VIT(patch_size_height, patch_size_width, img_height, img_width, in_channels, num_encoders, d_model, d_ff, h, num_classes, dropout_rate)

Initialize Vision Transformer (VIT).

Parameters:

- patch_size_height (int, required): Height of each patch.
- patch_size_width (int, required): Width of each patch.
- img_height (int, required): Height of input images.
- img_width (int, required): Width of input images.
- in_channels (int, required): Number of input channels.
- num_encoders (int, required): Number of encoder blocks.
- d_model (int, required): Dimension of the model.
- d_ff (int, required): Dimension of feed-forward layers.
- h (int, required): Number of attention heads.
- num_classes (int, required): Number of output classes.
- dropout_rate (float, required): Dropout rate for layers.

Source code in src/layers/cv/vit.py
def __init__(
    self,
    patch_size_height: int,
    patch_size_width: int,
    img_height: int,
    img_width: int,
    in_channels: int,
    num_encoders: int,
    d_model: int,
    d_ff: int,
    h: int,
    num_classes: int,
    dropout_rate: float,
) -> None:
    """
    Initialize Vision Transformer (VIT).

    Args:
        patch_size_height: Height of each patch.
        patch_size_width: Width of each patch.
        img_height: Height of input images.
        img_width: Width of input images.
        in_channels: Number of input channels.
        num_encoders: Number of encoder blocks.
        d_model: Dimension of the model.
        d_ff: Dimension of feed-forward layers.
        h: Number of attention heads.
        num_classes: Number of output classes.
        dropout_rate: Dropout rate for layers.
    """

    super().__init__()

    self.patch_size_height = patch_size_height
    self.patch_size_width = patch_size_width
    self.img_height = img_height
    self.img_width = img_width
    self.in_channels = in_channels
    self.num_encoders = num_encoders
    self.d_model = d_model
    self.d_ff = d_ff
    self.h = h
    self.num_classes = num_classes
    self.dropout_rate = dropout_rate

    # Number of patches
    self.num_patches = (img_height // patch_size_height) * (
        img_width // patch_size_width
    )

    # The CLS token provides a global representation of all the image inputs
    # (of the different embeddings of each patch)
    self.cls_token = nn.Parameter(torch.randn(1, 1, self.d_model))

    self.patch_layer = Patches(
        patch_size_height=self.patch_size_height,
        patch_size_width=self.patch_size_width,
        img_height=self.img_height,
        img_width=self.img_width,
    )

    self.embeddings = PatchEmbedding(
        patch_size_height=self.patch_size_height,
        patch_size_width=self.patch_size_width,
        in_channels=self.in_channels,
        d_model=self.d_model,
    )

    # The sequence length matches the number of patches
    # plus one extra embedding for the class token
    self.positional_encoding = PositionalEncoding(
        d_model=self.d_model,
        sequence_length=self.num_patches + 1,
        dropout_rate=self.dropout_rate,
    )

    # Encoder layers
    self.encoder_layers = nn.ModuleList(
        [
            EncoderBlock(
                d_model=self.d_model,
                d_ff=self.d_ff,
                h=self.h,
                dropout_rate=self.dropout_rate,
            )
            for _ in range(self.num_encoders)
        ]
    )

    self.layer_norm = LayerNormalization(features=self.d_model)

    self.mlp_classifier = nn.Sequential(
        nn.Linear(in_features=self.d_model, out_features=self.d_model),
        nn.GELU(),
        nn.Dropout(self.dropout_rate),
        nn.Linear(in_features=self.d_model, out_features=num_classes),
    )

1.4.9.1 forward(input_tensor)

Process input tensor through VIT model.

Parameters:

- input_tensor (Tensor, required): Batch of input images.

Returns:

- Tensor: Classification output tensor.

Source code in src/layers/cv/vit.py
def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
    """
    Process input tensor through VIT model.

    Args:
        input_tensor: Batch of input images.

    Returns:
        Classification output tensor.
    """

    # Extract the patches
    input_patches = self.patch_layer(input_tensor)

    # Convert the patches into embeddings
    patch_embeddings = self.embeddings(input_patches)

    # We have to prepend the class token to the sequence
    # (B, 1, d_model)
    cls_tokens = self.cls_token.expand(input_tensor.shape[0], -1, -1)
    # (B, num_patches+1, d_model)
    embeddings = torch.cat([cls_tokens, patch_embeddings], dim=1)

    # Add the positional encoding
    embeddings = self.positional_encoding(embeddings)

    # Transformer encoder blocks
    encoder_output = embeddings
    for encoder_layer in self.encoder_layers:
        encoder_output = encoder_layer(encoder_output)

    # Use only the CLS token for classification
    encoder_output = self.layer_norm(encoder_output)
    cls_output = encoder_output[:, 0]

    # Final classification
    return self.mlp_classifier(cls_output)
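
Putting the pieces together, a minimal end-to-end sketch with CIFAR-like sizes (the configuration values and the import path are assumptions made for the example):

import torch
from src.layers.cv.vit import VIT  # assumed import path

model = VIT(
    patch_size_height=4,
    patch_size_width=4,
    img_height=32,
    img_width=32,
    in_channels=3,
    num_encoders=4,
    d_model=64,
    d_ff=256,
    h=8,
    num_classes=10,
    dropout_rate=0.1,
)

images = torch.randn(8, 3, 32, 32)
logits = model(images)
print(logits.shape)  # torch.Size([8, 10])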

1.5 vq_vae

1.5.1 Decoder(in_channels, num_residuals, out_channels=3, hidden_size=256, kernel_size=4, stride=2)

Initializes a decoder with residual blocks and transpose convolutional layers.

Parameters:

- in_channels (int, required): Number of input channels to the decoder.
- num_residuals (int, required): Number of residual blocks in the decoder.
- out_channels (int, default: 3): Number of output channels, e.g., RGB.
- hidden_size (int, default: 256): Number of channels in hidden layers.
- kernel_size (int, default: 4): Size of the convolutional kernels.
- stride (int, default: 2): Stride of the convolutional kernels.

Source code in src/layers/cv/vq_vae.py
def __init__(
    self,
    in_channels: int,
    num_residuals: int,
    out_channels: int = 3,  # Channel output (RGB)
    hidden_size: int = 256,
    kernel_size: int = 4,
    stride: int = 2,
) -> None:
    """
    Initializes a decoder with residual blocks and transpose
    convolutional layers.

    Args:
        in_channels: Number of input channels to the decoder.
        num_residuals: Number of residual blocks in the decoder.
        out_channels: Number of output channels, e.g., RGB.
        hidden_size: Number of channels in hidden layers.
        kernel_size: Size of the convolutional kernels.
        stride: Stride of the convolutional kernels.
    """

    super().__init__()

    self.in_channels = in_channels
    self.num_residuals = num_residuals
    self.out_channels = out_channels
    self.hidden_size = hidden_size
    self.kernel_size = kernel_size
    self.stride = stride

    self.residual_blocks = nn.ModuleList(
        [
            ResidualBlock(
                in_channels=self.in_channels, hidden_size=self.hidden_size
            )
            for _ in range(self.num_residuals)
        ]
    )

    self.model = nn.Sequential(
        nn.ConvTranspose2d(
            in_channels=self.in_channels,
            out_channels=self.hidden_size,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=1,
        ),
        nn.ConvTranspose2d(
            in_channels=self.hidden_size,
            out_channels=self.out_channels,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=1,
        ),
    )

1.5.1.1 forward(input_tensor)

Forward pass through the decoder.

Parameters:

- input_tensor (Tensor, required): The input tensor to the decoder.

Returns:

- Tensor: A tensor processed by residual blocks and transpose convolutional layers.

Source code in src/layers/cv/vq_vae.py
def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
    """
    Forward pass through the decoder.

    Args:
        input_tensor: The input tensor to the decoder.

    Returns:
        A tensor processed by residual blocks and transpose
        convolutional layers.
    """

    decoder_output = input_tensor
    for res_block in self.residual_blocks:
        decoder_output = res_block(decoder_output)

    return self.model(decoder_output)

1.5.2 Encoder(in_channels, num_residuals, hidden_size=256, kernel_size=4, stride=2)

Initializes an encoder with convolutional layers and residual blocks.

Parameters:

- in_channels (int, required): Number of input channels to the encoder.
- num_residuals (int, required): Number of residual blocks in the encoder.
- hidden_size (int, default: 256): Number of channels in hidden layers.
- kernel_size (int, default: 4): Size of the convolutional kernels.
- stride (int, default: 2): Stride of the convolutional kernels.

Source code in src/layers/cv/vq_vae.py
def __init__(
    self,
    in_channels: int,
    num_residuals: int,
    hidden_size: int = 256,
    kernel_size: int = 4,
    stride: int = 2,
) -> None:
    """
    Initializes an encoder with convolutional layers and residual
    blocks.

    Args:
        in_channels: Number of input channels to the encoder.
        num_residuals: Number of residual blocks in the encoder.
        hidden_size: Number of channels in hidden layers.
        kernel_size: Size of the convolutional kernels.
        stride: Stride of the convolutional kernels.
    """

    super().__init__()

    self.in_channels = in_channels
    self.num_residuals = num_residuals
    self.hidden_size = hidden_size
    self.kernel_size = kernel_size
    self.stride = stride

    self.model = nn.Sequential(
        nn.Conv2d(
            in_channels=in_channels,
            out_channels=hidden_size,
            kernel_size=kernel_size,
            stride=stride,
            padding=1,
        ),
        nn.Conv2d(
            in_channels=hidden_size,
            out_channels=hidden_size,
            kernel_size=kernel_size,
            stride=stride,
            padding=1,
        ),
    )

    self.residual_blocks = nn.ModuleList(
        [
            ResidualBlock(in_channels=hidden_size, hidden_size=hidden_size)
            for _ in range(self.num_residuals)
        ]
    )

1.5.2.1 forward(input_tensor)

Forward pass through the encoder.

Parameters:

- input_tensor (Tensor, required): The input tensor to the encoder.

Returns:

- Tensor: A tensor processed by convolutional layers and residual blocks.

Source code in src/layers/cv/vq_vae.py
def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
    """
    Forward pass through the encoder.

    Args:
        input_tensor: The input tensor to the encoder.

    Returns:
        A tensor processed by convolutional layers and residual
        blocks.
    """

    encoder_output = self.model(input_tensor)
    for res_block in self.residual_blocks:
        encoder_output = res_block(encoder_output)
    return encoder_output

1.5.3 ResidualBlock(in_channels, hidden_size=256)

Initializes a residual block that applies two convolutional layers and ReLU activations.

Parameters:

- in_channels (int, required): Number of input channels for the block.
- hidden_size (int, default: 256): Number of channels in the hidden layer.

Source code in src/layers/cv/vq_vae.py
def __init__(self, in_channels: int, hidden_size: int = 256) -> None:
    """
    Initializes a residual block that applies two convolutional
    layers and ReLU activations.

    Args:
        in_channels: Number of input channels for the block.
        hidden_size: Number of channels in the hidden layer.
    """

    super().__init__()

    self.in_channels = in_channels
    self.hidden_size = hidden_size

    self.res_block = nn.Sequential(
        nn.ReLU(),
        nn.Conv2d(
            in_channels=self.in_channels,
            out_channels=self.hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
        ),
        nn.ReLU(),
        nn.Conv2d(
            in_channels=self.hidden_size,
            out_channels=self.in_channels,
            kernel_size=1,
            stride=1,
            bias=False,
        ),
    )

1.5.3.1 forward(input_tensor)

Forward pass through the residual block.

Parameters:

- input_tensor (Tensor, required): The input tensor to the block.

Returns:

- Tensor: A tensor that is the sum of the input tensor and the block's output.

Source code in src/layers/cv/vq_vae.py
def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
    """
    Forward pass through the residual block.

    Args:
        input_tensor: The input tensor to the block.

    Returns:
        A tensor that is the sum of the input tensor and the
        block's output.
    """

    return input_tensor + self.res_block(input_tensor)

1.5.4 VQVAE(in_channels, size_discrete_space, size_embeddings, num_residuals, hidden_size, kernel_size, stride, beta=0.25)

Initializes a VQ-VAE model with encoder, decoder, and quantizer.

Parameters:

- in_channels (int, required): Number of input channels for the model.
- size_discrete_space (int, required): Number of discrete embeddings.
- size_embeddings (int, required): Size of each embedding vector.
- num_residuals (int, required): Number of residual blocks in encoder/decoder.
- hidden_size (int, required): Number of channels in hidden layers.
- kernel_size (int, required): Size of convolutional kernels.
- stride (int, required): Stride of convolutional kernels.
- beta (float, default: 0.25): Weighting factor for the commitment loss.

Source code in src/layers/cv/vq_vae.py
def __init__(
    self,
    in_channels: int,
    size_discrete_space: int,
    size_embeddings: int,
    num_residuals: int,
    hidden_size: int,
    kernel_size: int,
    stride: int,
    beta: float = 0.25,
) -> None:
    """
    Initializes a VQ-VAE model with encoder, decoder, and quantizer.

    Args:
        in_channels: Number of input channels for the model.
        size_discrete_space: Number of discrete embeddings.
        size_embeddings: Size of each embedding vector.
        num_residuals: Number of residual blocks in encoder/decoder.
        hidden_size: Number of channels in hidden layers.
        kernel_size: Size of convolutional kernels.
        stride: Stride of convolutional kernels.
        beta: Weighting factor for the commitment loss.
    """

    super().__init__()

    self.in_channels = in_channels
    self.size_discrete_space = size_discrete_space
    self.size_embeddings = size_embeddings
    self.num_residuals = num_residuals
    self.hidden_size = hidden_size
    self.kernel_size = kernel_size
    self.stride = stride
    self.beta = beta

    self.encoder = Encoder(
        in_channels=self.in_channels,
        num_residuals=self.num_residuals,
        hidden_size=self.hidden_size,
        kernel_size=self.kernel_size,
        stride=self.stride,
    )
    self.decoder = Decoder(
        in_channels=self.hidden_size,
        num_residuals=self.num_residuals,
        out_channels=self.in_channels,
        hidden_size=self.hidden_size,
        kernel_size=self.kernel_size,
        stride=self.stride,
    )

    self.vector_quantizer = VectorQuantizer(
        size_discrete_space=self.size_discrete_space,
        size_embeddings=self.hidden_size,
        beta=self.beta,
    )

1.5.4.1 forward(input_tensor)

Forward pass through VQ-VAE model.

Parameters:

- input_tensor (Tensor, required): Input tensor to the model.

Returns:

- tuple[Tensor, Tensor, Tensor]: A tuple containing VQ loss, reconstructed tensor, and perplexity.

Source code in src/layers/cv/vq_vae.py
def forward(
    self, input_tensor: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Forward pass through VQ-VAE model.

    Args:
        input_tensor: Input tensor to the model.

    Returns:
        A tuple containing VQ loss, reconstructed tensor,
        and perplexity.
    """

    encoder_output = self.encoder(input_tensor)
    vq_loss, quantized, perplexity, _ = self.vector_quantizer(encoder_output)
    decoder_output = self.decoder(quantized)
    return vq_loss, decoder_output, perplexity
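
A sketch of how the three returned values are typically combined into a training objective; the reconstruction term, the configuration values and the import path below are assumptions, not taken from the repository:

import torch
import torch.nn.functional as F
from src.layers.cv.vq_vae import VQVAE  # assumed import path

model = VQVAE(
    in_channels=3,
    size_discrete_space=512,
    size_embeddings=64,
    num_residuals=2,
    hidden_size=64,
    kernel_size=4,
    stride=2,
    beta=0.25,
)

images = torch.randn(4, 3, 32, 32)
vq_loss, reconstruction, perplexity = model(images)

# Typical VQ-VAE objective: reconstruction error plus the quantization loss
loss = F.mse_loss(reconstruction, images) + vq_loss
print(reconstruction.shape, float(perplexity), float(loss))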

1.5.5 VectorQuantizer(size_discrete_space, size_embeddings, beta=0.25)

Initializes a vector quantizer with a learnable codebook.

Parameters:

- size_discrete_space (int, required): Number of discrete embeddings.
- size_embeddings (int, required): Size of each embedding vector.
- beta (float, default: 0.25): Weighting factor for the commitment loss.

Source code in src/layers/cv/vq_vae.py
def __init__(
    self, size_discrete_space: int, size_embeddings: int, beta: float = 0.25
) -> None:
    """
    Initializes a vector quantizer with a learnable codebook.

    Args:
        size_discrete_space: Number of discrete embeddings.
        size_embeddings: Size of each embedding vector.
        beta: Weighting factor for the commitment loss.
    """

    super().__init__()

    self.size_discrete_space = size_discrete_space
    self.size_embeddings = size_embeddings
    self.beta = beta

    # Define the codebook as a matrix of K embeddings x D embedding size
    # It has to be a learnable matrix
    self.codebook = nn.Embedding(
        num_embeddings=self.size_discrete_space, embedding_dim=self.size_embeddings
    )
    # Initialize weights uniformly
    self.codebook.weight.data.uniform_(
        -1 / self.size_discrete_space, 1 / self.size_discrete_space
    )

1.5.5.1 forward(encoder_output)

Quantizes the encoder output using the codebook.

Parameters:

- encoder_output (Tensor, required): Tensor of encoder outputs.

Returns:

- tuple[Tensor, Tensor, Tensor, Tensor]: A tuple containing VQ loss, quantized tensor, perplexity, and encodings.

Source code in src/layers/cv/vq_vae.py
def forward(
    self, encoder_output: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Quantizes the encoder output using the codebook.

    Args:
        encoder_output: Tensor of encoder outputs.

    Returns:
        A tuple containing VQ loss, quantized tensor, perplexity,
        and encodings.
    """

    # Note from other implementations: the channels are used as the space
    # in which to quantize.
    # Encoder output ->  (B, C, H, W) -> (0, 1, 2, 3) -> (0, 2, 3, 1) -> (0*2*3, 1)
    encoder_output = encoder_output.permute(0, 2, 3, 1).contiguous()
    b, h, w, c = encoder_output.size()
    encoder_output_flat = encoder_output.reshape(-1, c)

    # Compute the distances between the encoder outputs and the codebook vectors
    distances = (
        torch.sum(encoder_output_flat**2, dim=1, keepdim=True)
        + torch.sum(self.codebook.weight**2, dim=1)
        - 2 * torch.matmul(encoder_output_flat, self.codebook.weight.t())
    )

    # Take the encoding indices and add a dimension: (B*H*W, 1)
    encoding_indices = torch.argmin(distances, dim=1).unsqueeze(1)

    # Zero matrix of shape (num_indices, size_discrete_space)
    encodings = torch.zeros(
        encoding_indices.shape[0],
        self.size_discrete_space,
        device=encoder_output.device,
    )
    # Place a 1 at the index of the encoding with the
    # minimum distance, creating a one-hot vector
    encodings.scatter_(1, encoding_indices, 1)

    # Quantize by zeroing out the non-relevant codebook weights (large
    # distances) and reshape the tensor back
    quantized = torch.matmul(encodings, self.codebook.weight).view(b, h, w, c)

    # VQ-VAE loss terms
    # L = ||sg[z_e] - e||^2 + β||z_e - sg[e]||^2
    # Gradients flow to the codebook in the first term and to the encoder
    # in the second (commitment) term, which is the one weighted by beta
    embedding_loss = F.mse_loss(
        quantized, encoder_output.detach()
    )  # ||sg[z_e] - e||^2
    commitment_loss = F.mse_loss(
        quantized.detach(), encoder_output
    )  # ||z_e - sg[e]||^2
    vq_loss = embedding_loss + self.beta * commitment_loss

    # Straight-through estimator
    quantized = encoder_output + (quantized - encoder_output).detach()

    # Calculate perplexity
    avg_probs = torch.mean(encodings, dim=0)
    perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))

    # convert quantized from BHWC -> BCHW
    return (
        vq_loss,
        quantized.permute(0, 3, 1, 2).contiguous(),
        perplexity,
        encodings,
    )
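
To see the straight-through estimator in isolation, the following sketch (import path assumed) quantizes a random feature map whose channel count matches size_embeddings and checks that gradients reach the encoder side:

import torch
from src.layers.cv.vq_vae import VectorQuantizer  # assumed import path

quantizer = VectorQuantizer(size_discrete_space=16, size_embeddings=8, beta=0.25)
features = torch.randn(2, 8, 4, 4, requires_grad=True)  # (B, C, H, W) with C = embedding size

vq_loss, quantized, perplexity, encodings = quantizer(features)

# The straight-through estimator copies gradients through the quantization step
quantized.sum().backward()
print(features.grad is not None)  # True
print(quantized.shape, encodings.shape)  # (2, 8, 4, 4) and (32, 16)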