torch.nn.LSTM: PyTorch BiLSTM on MNIST
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch.autograd import Variable
'''
STEP 1: LOADING DATASET
'''
train_dataset = dsets.MNIST(root='./data',
                            train=True,
                            transform=transforms.ToTensor(),
                            download=True)

test_dataset = dsets.MNIST(root='./data',
                           train=False,
                           transform=transforms.ToTensor())
'''
STEP 2: MAKING DATASET ITERABLE
'''
batch_size = 100
n_iters = 10000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)
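A quick sanity check on the loaders (added here, not part of the original script) shows the shapes the model will see: ToTensor() yields MNIST batches of shape (100, 1, 28, 28), which the training loop later reshapes to (100, 28, 28) so that each of the 28 image rows becomes one 28-feature time step.

# Optional sanity check (added, not in the original script): inspect one batch.
images, labels = next(iter(train_loader))
print(images.shape)                   # torch.Size([100, 1, 28, 28])
print(images.view(-1, 28, 28).shape)  # torch.Size([100, 28, 28]) = (batch, seq, feature)
print(labels.shape)                   # torch.Size([100])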
'''
STEP 3: CREATE MODEL CLASS
'''
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim
        # Number of hidden layers
        self.layer_dim = layer_dim
        # Building the BiLSTM
        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim,
                            batch_first=True, bidirectional=True)
        # Readout layer: the BiLSTM concatenates the forward and backward
        # hidden states, so the linear layer takes hidden_dim * 2 features
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros, on GPU if available
        # shape: (num_layers * num_directions, batch, hidden_size)
        if torch.cuda.is_available():
            h0 = Variable(torch.zeros(self.layer_dim * 2, x.size(0), self.hidden_dim).cuda())
        else:
            h0 = Variable(torch.zeros(self.layer_dim * 2, x.size(0), self.hidden_dim))
        # Initialize cell state
        if torch.cuda.is_available():
            c0 = Variable(torch.zeros(self.layer_dim * 2, x.size(0), self.hidden_dim).cuda())
        else:
            c0 = Variable(torch.zeros(self.layer_dim * 2, x.size(0), self.hidden_dim))
        # Run all 28 time steps through the BiLSTM
        out, (hn, cn) = self.lstm(x, (h0, c0))
        # Index the hidden state of the last time step
        # out.size() --> 100, 28, 200 (hidden_dim * 2 because of bidirectional=True)
        # out[:, -1, :] --> 100, 200 --> just want the last time step's hidden states
        out = self.fc(out[:, -1, :])
        # out.size() --> 100, 10
        return out
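A minimal shape check (an addition, not from the original article; the names _model and _x are purely illustrative) confirms the bidirectional dimensions: with hidden_dim=100 the LSTM output carries 2 * 100 = 200 features per time step, and the readout maps the last step to 10 class logits.

# Optional shape check (illustrative, not part of the original code).
_model = LSTMModel(input_dim=28, hidden_dim=100, layer_dim=2, output_dim=10)
_x = torch.zeros(4, 28, 28)   # (batch, seq_len, input_dim)
print(_model(_x).shape)       # torch.Size([4, 10])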
'''
STEP 4: INSTANTIATE MODEL CLASS
'''
input_dim = 28    # each row of a 28x28 image is one input vector
hidden_dim = 100
layer_dim = 2     # two stacked bidirectional LSTM layers
output_dim = 10

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)

# Use GPU for the model if available
if torch.cuda.is_available():
    model.cuda()
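As an added reference point (not in the original post), the trainable parameter count can be read straight off model.parameters():

# Optional: count trainable parameters (added for illustration).
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Trainable parameters: {}'.format(num_params))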
'''
STEP 5: INSTANTIATE LOSS CLASS
'''
criterion = nn.CrossEntropyLoss()
'''
STEP 6: INSTANTIATE OPTIMIZER CLASS
'''
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
'''
STEP 7: TRAIN THE MODEL
'''
# Number of steps to unroll
seq_dim = 28
iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Load images as Variable, on GPU if available
        if torch.cuda.is_available():
            images = Variable(images.view(-1, seq_dim, input_dim).cuda())
            labels = Variable(labels.cuda())
        else:
            images = Variable(images.view(-1, seq_dim, input_dim))
            labels = Variable(labels)
        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()
        # Forward pass to get output/logits
        # outputs.size() --> 100, 10
        outputs = model(images)
        # Calculate loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)
        # Getting gradients w.r.t. parameters
        loss.backward()
        # Updating parameters
        optimizer.step()

        iter += 1
        if iter % 500 == 0:
            # Calculate accuracy on the test set
            correct = 0
            total = 0
            for images, labels in test_loader:
                if torch.cuda.is_available():
                    images = Variable(images.view(-1, seq_dim, input_dim).cuda())
                else:
                    images = Variable(images.view(-1, seq_dim, input_dim))
                # Forward pass only to get logits/output
                outputs = model(images)
                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)
                # Total number of labels
                total += labels.size(0)
                # Total correct predictions
                if torch.cuda.is_available():
                    correct += (predicted.cpu() == labels.cpu()).sum()
                else:
                    correct += (predicted == labels).sum()
            accuracy = 100 * correct / total
            # Print loss and accuracy
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.data, accuracy))
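A side note on the API: torch.autograd.Variable has been a no-op wrapper since PyTorch 0.4, so the GPU branches above can be collapsed with .to(device). Here is a sketch of the equivalent modern-style training step, assuming the model, loaders, criterion, and optimizer defined above.

# Modern-style equivalent of the inner training step (sketch, PyTorch >= 0.4).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
for images, labels in train_loader:
    images = images.view(-1, seq_dim, input_dim).to(device)
    labels = labels.to(device)
    optimizer.zero_grad()
    loss = criterion(model(images), labels)
    loss.backward()
    optimizer.step()
# Evaluation would additionally be wrapped in `with torch.no_grad():` to skip autograd.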
Experimental results:
Iteration: 500. Loss: 2.2969160079956055. Accuracy: 17.350000381469727
Iteration: 1000. Loss: 2.182490825653076. Accuracy: 19.200000762939453
Iteration: 1500. Loss: 1.0228978395462036. Accuracy: 48.369998931884766
Iteration: 2000. Loss: 0.4092962145805359. Accuracy: 85.83999633789062
Iteration: 2500. Loss: 0.11203524470329285. Accuracy: 92.01000213623047
Iteration: 3000. Loss: 0.1796608716249466. Accuracy: 95.13999938964844
Iteration: 3500. Loss: 0.22230899333953857. Accuracy: 95.80000305175781
Iteration: 4000. Loss: 0.08739805966615677. Accuracy: 96.58999633789062
Iteration: 4500. Loss: 0.12571051716804504. Accuracy: 96.1500015258789
Iteration: 5000. Loss: 0.03434598073363304. Accuracy: 97.19999694824219
Iteration: 5500. Loss: 0.1483801007270813. Accuracy: 97.12000274658203
Iteration: 6000. Loss: 0.0666593387722969. Accuracy: 97.86000061035156
Iteration: 6500. Loss: 0.026284243911504745. Accuracy: 98.18000030517578
Iteration: 7000. Loss: 0.02744879201054573. Accuracy: 98.0999984741211
Iteration: 7500. Loss: 0.07149388641119003. Accuracy: 98.31999969482422
Iteration: 8000. Loss: 0.08140204101800919. Accuracy: 97.06999969482422
Iteration: 8500. Loss: 0.04244286194443703. Accuracy: 98.37999725341797
Iteration: 9000. Loss: 0.04651059955358505. Accuracy: 98.08000183105469
Iteration: 9500. Loss: 0.004135502967983484. Accuracy: 98.37999725341797
nn.LSTM source code:
class LSTM(RNNBase):
    r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input
    sequence.

    For each element in the input sequence, each layer computes the following
    function:

    .. math::
        \begin{array}{ll} \\
            i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
            f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
            g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
            o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\
            c_t = f_t \odot c_{t-1} + i_t \odot g_t \\
            h_t = o_t \odot \tanh(c_t) \\
        \end{array}

    where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell
    state at time `t`, :math:`x_t` is the input at time `t`, :math:`h_{t-1}`
    is the hidden state of the layer at time `t-1` or the initial hidden
    state at time `0`, and :math:`i_t`, :math:`f_t`, :math:`g_t`,
    :math:`o_t` are the input, forget, cell, and output gates, respectively.
    :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.

    In a multilayer LSTM, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
    (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
    variable which is :math:`0` with probability :attr:`dropout`.

    If ``proj_size > 0`` is specified, LSTM with projections will be used. This changes
    the LSTM cell in the following way. First, the dimension of :math:`h_t` will be changed from
    ``hidden_size`` to ``proj_size`` (dimensions of :math:`W_{hi}` will be changed accordingly).
    Second, the output hidden state of each layer will be multiplied by a learnable projection
    matrix: :math:`h_t = W_{hr}h_t`. Note that as a consequence of this, the output
    of LSTM network will be of different shape as well. See Inputs/Outputs sections below for exact
    dimensions of all variables. You can find more details in https://arxiv.org/abs/1402.1128.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
            would mean stacking two LSTMs together to form a `stacked LSTM`,
            with the second LSTM taking in outputs of the first LSTM and
            computing the final results. Default: 1
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        batch_first: If ``True``, then the input and output tensors are provided
            as (batch, seq, feature). Default: ``False``
        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
            LSTM layer except the last layer, with dropout probability equal to
            :attr:`dropout`. Default: 0
        bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False``
        proj_size: If ``> 0``, will use LSTM with projections of corresponding size. Default: 0

    Inputs: input, (h_0, c_0)
        - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features
          of the input sequence.
          The input can also be a packed variable length sequence.
          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
          :func:`torch.nn.utils.rnn.pack_sequence` for details.
        - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
          containing the initial hidden state for each element in the batch.
          If the LSTM is bidirectional, num_directions should be 2, else it should be 1.
          If ``proj_size > 0`` was specified, the shape has to be
          `(num_layers * num_directions, batch, proj_size)`.
        - **c_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
          containing the initial cell state for each element in the batch.

          If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero.

    Outputs: output, (h_n, c_n)
        - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor
          containing the output features `(h_t)` from the last layer of the LSTM,
          for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
          given as the input, the output will also be a packed sequence. If ``proj_size > 0``
          was specified, output shape will be `(seq_len, batch, num_directions * proj_size)`.

          For the unpacked case, the directions can be separated
          using ``output.view(seq_len, batch, num_directions, hidden_size)``,
          with forward and backward being direction `0` and `1` respectively.
          Similarly, the directions can be separated in the packed case.
        - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
          containing the hidden state for `t = seq_len`. If ``proj_size > 0``
          was specified, ``h_n`` shape will be `(num_layers * num_directions, batch, proj_size)`.

          Like *output*, the layers can be separated using
          ``h_n.view(num_layers, num_directions, batch, hidden_size)`` and similarly for *c_n*.
        - **c_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
          containing the cell state for `t = seq_len`.

    Attributes:
        weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
            `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size, input_size)` for `k = 0`.
            Otherwise, the shape is `(4*hidden_size, num_directions * hidden_size)`
        weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
            `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size, hidden_size)`. If ``proj_size > 0``
            was specified, the shape will be `(4*hidden_size, proj_size)`.
        bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
            `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)`
        bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
            `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)`
        weight_hr_l[k] : the learnable projection weights of the :math:`\text{k}^{th}` layer
            of shape `(proj_size, hidden_size)`. Only present when ``proj_size > 0`` was
            specified.

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    .. include:: ../cudnn_rnn_determinism.rst

    .. include:: ../cudnn_persistent_rnn.rst

    Examples::

        >>> rnn = nn.LSTM(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
        >>> c0 = torch.randn(2, 3, 20)
        >>> output, (hn, cn) = rnn(input, (h0, c0))
    """
    def __init__(self, *args, **kwargs):
        super(LSTM, self).__init__('LSTM', *args, **kwargs)

    def get_expected_cell_size(self, input: Tensor, batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]:
        if batch_sizes is not None:
            mini_batch = int(batch_sizes[0])
        else:
            mini_batch = input.size(0) if self.batch_first else input.size(1)
        num_directions = 2 if self.bidirectional else 1
        expected_hidden_size = (self.num_layers * num_directions,
                                mini_batch, self.hidden_size)
        return expected_hidden_size

    # In the future, we should prevent mypy from applying contravariance rules here.
    # See torch/nn/modules/module.py::_forward_unimplemented
    def check_forward_args(self, input: Tensor, hidden: Tuple[Tensor, Tensor], batch_sizes: Optional[Tensor]):  # type: ignore
        self.check_input(input, batch_sizes)
        self.check_hidden_size(hidden[0], self.get_expected_hidden_size(input, batch_sizes),
                               'Expected hidden[0] size {}, got {}')
        self.check_hidden_size(hidden[1], self.get_expected_cell_size(input, batch_sizes),
                               'Expected hidden[1] size {}, got {}')

    # Same as above, see torch/nn/modules/module.py::_forward_unimplemented
    def permute_hidden(self, hx: Tuple[Tensor, Tensor], permutation: Optional[Tensor]) -> Tuple[Tensor, Tensor]:  # type: ignore
        if permutation is None:
            return hx
        return apply_permutation(hx[0], permutation), apply_permutation(hx[1], permutation)

    # Same as above, see torch/nn/modules/module.py::_forward_unimplemented
    @overload  # type: ignore
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None
                ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:  # noqa: F811
        pass

    # Same as above, see torch/nn/modules/module.py::_forward_unimplemented
    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: PackedSequence, hx: Optional[Tuple[Tensor, Tensor]] = None
                ) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]]:  # noqa: F811
        pass

    def forward(self, input, hx=None):  # noqa: F811
        orig_input = input
        # xxx: isinstance check needs to be in conditional for TorchScript to compile
        if isinstance(orig_input, PackedSequence):
            input, batch_sizes, sorted_indices, unsorted_indices = input
            max_batch_size = batch_sizes[0]
            max_batch_size = int(max_batch_size)
        else:
            batch_sizes = None
            max_batch_size = input.size(0) if self.batch_first else input.size(1)
            sorted_indices = None
            unsorted_indices = None

        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size
            h_zeros = torch.zeros(self.num_layers * num_directions,
                                  max_batch_size, real_hidden_size,
                                  dtype=input.dtype, device=input.device)
            c_zeros = torch.zeros(self.num_layers * num_directions,
                                  max_batch_size, self.hidden_size,
                                  dtype=input.dtype, device=input.device)
            hx = (h_zeros, c_zeros)
        else:
            # Each batch of the hidden state should match the input sequence that
            # the user believes he/she is passing in.
            hx = self.permute_hidden(hx, sorted_indices)

        self.check_forward_args(input, hx, batch_sizes)
        if batch_sizes is None:
            result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
                              self.dropout, self.training, self.bidirectional, self.batch_first)
        else:
            result = _VF.lstm(input, batch_sizes, hx, self._flat_weights, self.bias,
                              self.num_layers, self.dropout, self.training, self.bidirectional)
        output = result[0]
        hidden = result[1:]
        # xxx: isinstance check needs to be in conditional for TorchScript to compile
        if isinstance(orig_input, PackedSequence):
            output_packed = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
            return output_packed, self.permute_hidden(hidden, unsorted_indices)
        else:
            return output, self.permute_hidden(hidden, unsorted_indices)
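To connect the source above back to the BiLSTM used in this post, here is a small added sketch (not from the original article) showing the shapes produced by a 2-layer bidirectional LSTM with batch_first=True, and how the two directions can be separated along the last dimension as the docstring describes.

# Illustrative sketch: shapes of a 2-layer BiLSTM with batch_first=True.
rnn = nn.LSTM(input_size=28, hidden_size=100, num_layers=2,
              batch_first=True, bidirectional=True)
x = torch.randn(100, 28, 28)                 # (batch, seq_len, input_size)
output, (hn, cn) = rnn(x)                    # h_0 and c_0 default to zeros
print(output.shape)                          # torch.Size([100, 28, 200])
print(hn.shape, cn.shape)                    # torch.Size([4, 100, 100]) each
directions = output.view(100, 28, 2, 100)    # (batch, seq, num_directions, hidden_size)
forward_out, backward_out = directions[..., 0, :], directions[..., 1, :]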