diff --git a/404.html b/404.html index b7d16e7ab94dce2097726ce3c55dead1b1efc3ab..fa1a4c1192363e9ac3cc41371a5775b3cc3cf19c 100644 --- a/404.html +++ b/404.html @@ -38,6 +38,8 @@ + + @@ -54,6 +56,16 @@ + + + + @@ -71,7 +83,7 @@ @@ -144,6 +156,9 @@
+library(torch) + +# creates example tensors. requires_grad = TRUE tells torch that +# we are going to take derivatives with respect to x. +x <- torch_tensor(3, requires_grad = TRUE) +y <- torch_tensor(2) + +# executes the forward operation x^2 +o <- x^2 + +# computes the backward pass for each tensor that is marked with +# requires_grad = TRUE +o$backward() + +# get do/dx = 2 * x (at x = 3) +x$grad
## torch_tensor
+## 6
+## [ CPUFloatType{1} ]
+ +library(torch) + +# defines a custom nn_module that implements a dense +# (fully-connected) layer with a weight matrix w and a bias b. +dense <- nn_module( + classname = "dense", + # the initialize function runs whenever we instantiate the model + initialize = function(in_features, out_features) { + + # just for you to see when this function is called + cat("Calling initialize!") + + # we use nn_parameter to indicate that those tensors are special + # and should be treated as parameters by `nn_module`. + self$w <- nn_parameter(torch_randn(in_features, out_features)) + self$b <- nn_parameter(torch_zeros(out_features)) + + }, + # this function is called whenever we call our model on input. + forward = function(x) { + cat("Calling forward!") + torch_mm(x, self$w) + self$b + } +) + +model <- dense(3, 1)
## Calling initialize!
++# you can get all parameters +model$parameters
## $w
+## torch_tensor
+## 0.01 *
+## 4.5107
+## -45.9509
+## -246.0987
+## [ CPUFloatType{3,1} ]
+##
+## $b
+## torch_tensor
+## 0
+## [ CPUFloatType{1} ]
++# or individually +model$w
## torch_tensor
+## 0.01 *
+## 4.5107
+## -45.9509
+## -246.0987
+## [ CPUFloatType{3,1} ]
++model$b
## torch_tensor
+## 0
+## [ CPUFloatType{1} ]
++# create an input tensor +x <- torch_randn(10, 3) +y_pred <- model(x)
## Calling forward!
+
+y_pred
## torch_tensor
+## -0.5823
+## 1.1249
+## 0.7531
+## -0.5148
+## 0.1950
+## -4.0449
+## 2.2684
+## 0.5924
+## 3.6237
+## -1.8567
+## [ CPUFloatType{10,1} ]
+ +library(torch) + +# In deep learning models you don't usually have all your data in RAM +# because you are usually training using mini-batch gradient descent, +# thus only needing a mini-batch in RAM at a time. + +# In torch we use the `datasets` abstraction to define the process of +# loading data. Once you have defined your dataset you can use torch +# dataloaders that allow you to iterate over this dataset in batches. + +# Note that datasets are optional in torch. They are just there as a +# recommended way to load data. + +# Below you will see an example of how to create a simple torch dataset +# that pre-processes a data.frame into tensors so you can feed them to +# a model. + +df_dataset <- dataset( + "mydataset", + + # the input data to your dataset goes in the initialize function. + # our dataset will take a dataframe and the name of the response + # variable. + initialize = function(df, response_variable) { + self$df <- df[,-which(names(df) == response_variable)] + self$response_variable <- df[[response_variable]] + }, + + # the .getitem method takes an index as input and returns the + # corresponding item from the dataset. + # the index could be anything. the dataframe could have many + # rows for each index and the .getitem method would do some + # kind of aggregation before returning the element. + # in our case the index is simply a row of the data.frame. + .getitem = function(index) { + response <- torch_tensor(self$response_variable[index]) + x <- torch_tensor(as.numeric(self$df[index,])) + + # note that the dataloaders will automatically stack tensors + # creating a new dimension + list(x = x, y = response) + }, + + # It's optional, but helpful to define the .length method returning + # the number of elements in the dataset. This is needed if you want + # to shuffle your dataset. + .length = function() { + length(self$response_variable) + } + +) + + +# we can now initialize an instance of our dataset. +# for example +mtcars_dataset <- df_dataset(mtcars, "mpg") + +# now we can get an item with +mtcars_dataset$.getitem(1)
## $x
+## torch_tensor
+## 6.0000
+## 160.0000
+## 110.0000
+## 3.9000
+## 2.6200
+## 16.4600
+## 0.0000
+## 1.0000
+## 4.0000
+## 4.0000
+## [ CPUFloatType{10} ]
+##
+## $y
+## torch_tensor
+## 21
+## [ CPUFloatType{1} ]
++# Given a dataset you can create a dataloader with +dl <- dataloader(mtcars_dataset, batch_size = 15, shuffle = TRUE) + +# we can then loop through the elements of the dataloader with +for(batch in enumerate(dl)) { + cat("X size: ") + print(batch[[1]]$size()) + cat("Y size: ") + print(batch[[2]]$size()) +}
## X size: [1] 15 10
+## Y size: [1] 15 1
+## X size: [1] 15 10
+## Y size: [1] 15 1
+## X size: [1] 2 10
+## Y size: [1] 2 1
+ Gallery of scripts demonstrating torch functionality.
| Examples | +
|---|
| basic-autograd | +
| basic-nn-module | +
| dataset | +
Adding operations to autograd requires implementing a new autograd_function for each operation. Recall that autograd_functions are what autograd uses to compute the results and gradients, and encode the operation history. Every new function requires you to implement two methods:
forward() - the code that performs the operation. It can take as many arguments as you want, some of them optional if you specify default values. All kinds of R objects are accepted here. Tensor arguments that track history (i.e., with requires_grad=TRUE) will be converted to ones that don’t track history before the call, and their use will be registered in the graph. Note that this logic won’t traverse lists or any other data structures and will only consider Tensors that are direct arguments to the call. You can return either a single Tensor output, or a list of Tensors if there are multiple outputs. Also, please refer to the docs of autograd_function to find descriptions of useful methods that can be called only from forward().
Below you can find code for a linear function:
-linear <- autograd_function( - forward = function(ctx, input, weight, bias = NULL) { - ctx$save_for_backward(input = input, weight = weight, bias = bias) - output <- input$mm(weight$t()) - if (!is.null(bias)) - output <- output + bias$unsqueeze(0)$expand_as(output) +linear <- autograd_function( + forward = function(ctx, input, weight, bias = NULL) { + ctx$save_for_backward(input = input, weight = weight, bias = bias) + output <- input$mm(weight$t()) + if (!is.null(bias)) + output <- output + bias$unsqueeze(0)$expand_as(output) - output - }, - backward = function(ctx, grad_output) { + output + }, + backward = function(ctx, grad_output) { - s <- ctx$saved_variables + s <- ctx$saved_variables - grads <- list( - input = NULL, - weight = NULL, - bias = NULL - ) + grads <- list( + input = NULL, + weight = NULL, + bias = NULL + ) - if (ctx$needs_input_grad$input) - grads$input <- grad_output$mm(s$weight) + if (ctx$needs_input_grad$input) + grads$input <- grad_output$mm(s$weight) - if (ctx$needs_input_grad$weight) - grads$weight <- grad_output$t()$mm(s$input) + if (ctx$needs_input_grad$weight) + grads$weight <- grad_output$t()$mm(s$input) - if (!is.null(s$bias) && ctx$needs_input_grad$bias) - grads$bias <- grad_output$sum(dim = 0) + if (!is.null(s$bias) && ctx$needs_input_grad$bias) + grads$bias <- grad_output$sum(dim = 0) - grads - } -) -
Here, we give an additional example of a function that is parametrized by non-Tensor arguments:
-mul_constant <- autograd_function( - forward = function(ctx, tensor, constant) { - ctx$save_for_backward(constant = constant) - tensor * constant - }, - backward = function(ctx, grad_output) { - v <- ctx$saved_variables - list( - tensor = grad_output * v$constant - ) - } -) -
-x <- torch_tensor(1, requires_grad = TRUE) -o <- mul_constant(x, 2) -o$backward() -x$grad +x <- torch_tensor(1, requires_grad = TRUE) +o <- mul_constant(x, 2) +o$backward() +x$grad #> torch_tensor #> 2 -#> [ CPUFloatType{1} ] -
Note: This is an R port of the official tutorial available here. All credit goes to Soumith Chintala.
+library(torch)Central to all neural networks in torch is the autograd functionality. Let’s first briefly visit it, and we will then move on to training our first neural network.
Autograd provides automatic differentiation for all operations on Tensors. It is a define-by-run framework, which means that your backprop is defined by how your code is run, and that every single iteration can be different.
Let us see this in more simple terms with some examples.
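For instance, here is a small sketch (the numbers are made up purely for illustration) of what define-by-run means: the recorded graph is whatever the R code actually executed for this particular input.
x <- torch_tensor(2, requires_grad = TRUE)
# the branch taken at run time is what gets recorded in the graph
if (as.numeric(x) > 0) {
  y <- x * 3
} else {
  y <- x * -1
}
y$backward()
x$grad # 3, the derivative of the branch that actually ran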
@@ -175,74 +201,67 @@If you want to compute the derivatives, you can call $backward() on a Tensor. If the Tensor is a scalar (i.e. it holds a single element), you don’t need to specify any arguments to backward(); however, if it has more elements, you need to specify a gradient argument that is a tensor of matching shape.
Create a tensor and set requires_grad=TRUE to track computation with it:
-x <- torch_ones(2, 2, requires_grad = TRUE) -x +x <- torch_ones(2, 2, requires_grad = TRUE) +x #> torch_tensor #> 1 1 #> 1 1 -#> [ CPUFloatType{2,2} ] -
Do a tensor operation:
-y <- x + 2 -y +y <- x + 2 +y #> torch_tensor #> 3 3 #> 3 3 -#> [ CPUFloatType{2,2} ] -
y was created as a result of an operation, so it has a grad_fn.
-y$grad_fn -#> AddBackward1 -
Do more operations on y
-z <- y * y * 3 -z +z <- y * y * 3 +z #> torch_tensor #> 27 27 #> 27 27 #> [ CPUFloatType{2,2} ] -out <- z$mean() -out +out <- z$mean() +out #> torch_tensor #> 27 -#> [ CPUFloatType{} ] -
$requires_grad_( ... ) changes an existing Tensor’s requires_grad flag in-place. The input flag defaults to FALSE if not given.
-a <- torch_randn(2, 2) -a <- (a * 3) / (a - 1) -a$requires_grad +a <- torch_randn(2, 2) +a <- (a * 3) / (a - 1) +a$requires_grad #> [1] FALSE -a$requires_grad_(TRUE) +a$requires_grad_(TRUE) #> torch_tensor -#> -0.4350 1.4882 -#> -0.5849 9.3457 +#> 1.8070 0.0621 +#> -3.7943 1.4618 #> [ CPUFloatType{2,2} ] -a$requires_grad +a$requires_grad #> [1] TRUE -b <- (a * a)$sum() -b$grad_fn -#> SumBackward0 -
Let’s backprop now. Because out contains a single scalar, out$backward() is equivalent to out$backward(torch_tensor(1)).
-out$backward() -
Print gradients d(out)/dx
-x$grad +x$grad #> torch_tensor #> 4.5000 4.5000 #> 4.5000 4.5000 -#> [ CPUFloatType{2,2} ] -
You should have got a matrix of 4.5. Let’s call the out Tensor \(o\).
We have that \(o = \frac{1}{4}\sum_i z_i\), \(z_i = 3(x_i+2)^2\) and \(z_i\bigr\rvert_{x_i=1} = 27\). Therefore, \(\frac{\partial o}{\partial x_i} = \frac{3}{2}(x_i+2)\), hence \(\frac{\partial o}{\partial x_i}\bigr\rvert_{x_i=1} = \frac{9}{2} = 4.5\).
Mathematically, if you have a vector valued function \(\vec{y}=f(\vec{x})\), then the gradient of \(\vec{y}\) with respect to \(\vec{x}\) is a Jacobian matrix:
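Written out (standard calculus, shown here for completeness), the Jacobian is
\[
J = \begin{pmatrix}
\frac{\partial y_1}{\partial x_1} & \cdots & \frac{\partial y_1}{\partial x_n} \\
\vdots & \ddots & \vdots \\
\frac{\partial y_m}{\partial x_1} & \cdots & \frac{\partial y_m}{\partial x_n}
\end{pmatrix}
\]
and, given a vector \(v\), what backward(v) computes is the vector-Jacobian product \(J^T v\); when \(v\) is the gradient of a scalar loss with respect to \(\vec{y}\), this product is exactly the gradient of that loss with respect to \(\vec{x}\).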
@@ -273,50 +292,46 @@This characteristic of vector-Jacobian product makes it very convenient to feed external gradients into a model that has non-scalar output.
Now let’s take a look at an example of vector-Jacobian product:
-x <- torch_randn(3, requires_grad=TRUE) -y <- 100 * x -y +x <- torch_randn(3, requires_grad=TRUE) +y <- 100 * x +y #> torch_tensor -#> -50.4960 -#> -28.4113 -#> 101.7135 -#> [ CPUFloatType{3} ] -
Now in this case y is no longer a scalar. autograd could not compute the full Jacobian directly, but if we just want the vector-Jacobian product, simply pass the vector to backward as argument:
-v <- torch_tensor(c(0.1, 1.0, 0.0001)) -y$backward(v) +v <- torch_tensor(c(0.1, 1.0, 0.0001)) +y$backward(v) -x$grad +x$grad #> torch_tensor #> 1.0000e+01 #> 1.0000e+02 #> 1.0000e-02 -#> [ CPUFloatType{3} ] -
You can also stop autograd from tracking history on Tensors with $requires_grad=TRUE, either by wrapping the code block in with_no_grad() or by using $detach() to get a new Tensor with the same content that does not require gradients:
-x$requires_grad +x$requires_grad #> [1] TRUE -(x ** 2)$requires_grad +(x ** 2)$requires_grad #> [1] TRUE -with_no_grad({ - print((x ** 2)$requires_grad) -}) -#> [1] FALSE -
-x$requires_grad +x$requires_grad #> [1] TRUE -y <- x$detach() -y$requires_grad +y <- x$detach() +y$requires_grad #> [1] FALSE -x$eq(y)$all() +x$eq(y)$all() #> torch_tensor #> 1 -#> [ CPUBoolType{} ] -
Read Later:
Document about help(autograd_function), vignette("using-autograd"), vignette("extending-autograd").
Note: This is an R port of the official tutorial available here. All credit goes to Justin Johnson.
+library(torch)As an example of dynamic graphs and weight sharing, we implement a very strange model: a fully-connected ReLU network that on each forward pass chooses a random number between 1 and 4 and uses that many hidden layers, reusing the same weights multiple times to compute the innermost hidden layers.
For this model we can use normal R flow control to implement the loop, and we can implement weight sharing among the innermost layers by simply reusing the same Module multiple times when defining the forward pass.
We can easily implement this model using nn_module:
-dynamic_net <- nn_module( +dynamic_net <- nn_module( "dynamic_net", # In the constructor we construct three nn_linear instances that we will use # in the forward pass. - initialize = function(D_in, H, D_out) { - self$input_linear <- nn_linear(D_in, H) - self$middle_linear <- nn_linear(H, H) - self$output_linear <- nn_linear(H, D_out) - }, + initialize = function(D_in, H, D_out) { + self$input_linear <- nn_linear(D_in, H) + self$middle_linear <- nn_linear(H, H) + self$output_linear <- nn_linear(H, D_out) + }, # For the forward pass of the model, we randomly choose either 0, 1, 2, or 3 # and reuse the middle_linear Module that many times to compute hidden layer # representations. @@ -185,85 +211,84 @@ # Here we also see that it is perfectly safe to reuse the same Module many # times when defining a computational graph. This is a big improvement from Lua # Torch, where each Module could be used only once. - forward = function(x) { - h_relu <- self$input_linear(x)$clamp(min = 0) - for (i in seq_len(sample.int(4, size = 1))) { - h_relu <- self$middle_linear(h_relu)$clamp(min=0) - } - y_pred <- self$output_linear(h_relu) - y_pred - } -) + forward = function(x) { + h_relu <- self$input_linear(x)$clamp(min = 0) + for (i in seq_len(sample.int(4, size = 1))) { + h_relu <- self$middle_linear(h_relu)$clamp(min=0) + } + y_pred <- self$output_linear(h_relu) + y_pred + } +) -if (cuda_is_available()) { - device <- torch_device("cuda") -} else { - device <- torch_device("cpu") -} +if (cuda_is_available()) { + device <- torch_device("cuda") +} else { + device <- torch_device("cpu") +} # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. -N <- 64 -D_in <- 1000 -H <- 100 -D_out <- 10 +N <- 64 +D_in <- 1000 +H <- 100 +D_out <- 10 # Create random input and output data # Setting requires_grad=FALSE (the default) indicates that we do not need to # compute gradients with respect to these Tensors during the backward pass. -x <- torch_randn(N, D_in, device=device) -y <- torch_randn(N, D_out, device=device) +x <- torch_randn(N, D_in, device=device) +y <- torch_randn(N, D_out, device=device) # Construct our model by instantiating the class defined above -model <- dynamic_net(D_in, H, D_out) +model <- dynamic_net(D_in, H, D_out) # The nn package also contains definitions of popular loss functions; in this # case we will use Mean Squared Error (MSE) as our loss function. -loss_fn <- nnf_mse_loss +loss_fn <- nnf_mse_loss # Use the optim package to define an Optimizer that will update the weights of # the model for us. Here we will use Adam; the optim package contains many other # optimization algorithms. The first argument to the Adam constructor tells the # optimizer which Tensors it should update. -learning_rate <- 1e-4 -optimizer <- optim_sgd(model$parameters, lr=learning_rate, momentum = 0.9) +learning_rate <- 1e-4 +optimizer <- optim_sgd(model$parameters, lr=learning_rate, momentum = 0.9) -for (t in seq_len(500)) { +for (t in seq_len(500)) { # Forward pass: compute predicted y by passing x to the model. Module objects # can be called like functions. When doing so you pass a Tensor of input # data to the Module and it produces a Tensor of output data. - y_pred <- model(x) + y_pred <- model(x) # Compute and print loss. We pass Tensors containing the predicted and true # values of y, and the loss function returns a Tensor containing the # loss. 
- loss <- loss_fn(y_pred, y) - if (t %% 100 == 0 || t == 1) - cat("Step:", t, ":", as.numeric(loss), "\n") + loss <- loss_fn(y_pred, y) + if (t %% 100 == 0 || t == 1) + cat("Step:", t, ":", as.numeric(loss), "\n") # Before the backward pass, use the optimizer object to zero all of the # gradients for the variables it will update (which are the learnable # weights of the model). This is because by default, gradients are # accumulated in buffers( i.e, not overwritten) whenever $backward() # is called. Checkout docs of `autograd_backward` for more details. - optimizer$zero_grad() + optimizer$zero_grad() # Backward pass: compute gradient of the loss with respect to model # parameters - loss$backward() + loss$backward() # Calling the step function on an Optimizer makes an update to its # parameters - optimizer$step() -} -#> Step: 1 : 1.054659 -#> Step: 100 : 1.05705 -#> Step: 200 : 1.048708 -#> Step: 300 : 1.052647 -#> Step: 400 : 1.042869 -#> Step: 500 : 1.039991 -
Sometimes you will want to specify models that are more complex than a sequence of existing Modules; for these cases you can define your own Modules by using nn_module function and defining a forward which receives input Tensors and produces output Tensors using other modules or other autograd operations on Tensors.
In this example we implement our two-layer network as a custom Module subclass:
-two_layer_net <- nn_module( +two_layer_net <- nn_module( "two_layer_net", - initialize = function(D_in, H, D_out) { - self$linear1 <- nn_linear(D_in, H) - self$linear2 <- nn_linear(H, D_out) - }, - forward = function(x) { - x %>% - self$linear1() %>% - nnf_relu() %>% - self$linear2() - } -) + initialize = function(D_in, H, D_out) { + self$linear1 <- nn_linear(D_in, H) + self$linear2 <- nn_linear(H, D_out) + }, + forward = function(x) { + x %>% + self$linear1() %>% + nnf_relu() %>% + self$linear2() + } +) -if (cuda_is_available()) { - device <- torch_device("cuda") -} else { - device <- torch_device("cpu") -} +if (cuda_is_available()) { + device <- torch_device("cuda") +} else { + device <- torch_device("cpu") +} # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. -N <- 64 -D_in <- 1000 -H <- 100 -D_out <- 10 +N <- 64 +D_in <- 1000 +H <- 100 +D_out <- 10 # Create random input and output data # Setting requires_grad=FALSE (the default) indicates that we do not need to # compute gradients with respect to these Tensors during the backward pass. -x <- torch_randn(N, D_in, device=device) -y <- torch_randn(N, D_out, device=device) +x <- torch_randn(N, D_in, device=device) +y <- torch_randn(N, D_out, device=device) # Construct our model by instantiating the class defined above -model <- two_layer_net(D_in, H, D_out) +model <- two_layer_net(D_in, H, D_out) # The nn package also contains definitions of popular loss functions; in this # case we will use Mean Squared Error (MSE) as our loss function. -loss_fn <- nnf_mse_loss +loss_fn <- nnf_mse_loss # Use the optim package to define an Optimizer that will update the weights of # the model for us. Here we will use Adam; the optim package contains many other # optimization algorithms. The first argument to the Adam constructor tells the # optimizer which Tensors it should update. -learning_rate <- 1e-4 -optimizer <- optim_sgd(model$parameters, lr=learning_rate) +learning_rate <- 1e-4 +optimizer <- optim_sgd(model$parameters, lr=learning_rate) -for (t in seq_len(500)) { +for (t in seq_len(500)) { # Forward pass: compute predicted y by passing x to the model. Module objects # can be called like functions. When doing so you pass a Tensor of input # data to the Module and it produces a Tensor of output data. - y_pred <- model(x) + y_pred <- model(x) # Compute and print loss. We pass Tensors containing the predicted and true # values of y, and the loss function returns a Tensor containing the # loss. - loss <- loss_fn(y_pred, y) - if (t %% 100 == 0 || t == 1) - cat("Step:", t, ":", as.numeric(loss), "\n") + loss <- loss_fn(y_pred, y) + if (t %% 100 == 0 || t == 1) + cat("Step:", t, ":", as.numeric(loss), "\n") # Before the backward pass, use the optimizer object to zero all of the # gradients for the variables it will update (which are the learnable # weights of the model). This is because by default, gradients are # accumulated in buffers( i.e, not overwritten) whenever $backward() # is called. Checkout docs of `autograd_backward` for more details. - optimizer$zero_grad() + optimizer$zero_grad() # Backward pass: compute gradient of the loss with respect to model # parameters - loss$backward() + loss$backward() # Calling the step function on an Optimizer makes an update to its # parameters - optimizer$step() -} -#> Step: 1 : 1.04065 -#> Step: 100 : 1.026708 -#> Step: 200 : 1.013019 -#> Step: 300 : 0.9996911 -#> Step: 400 : 0.986709 -#> Step: 500 : 0.9740159 -
In the next example we will learn about dynamic graphs in torch.
diff --git a/articles/getting-started/neural-networks.html b/articles/getting-started/neural-networks.html index 9c40455a7655242b62260f8840edffbc81a1e8e8..ca6fdb7961595079cf0521d1d767a39df312caba 100644 --- a/articles/getting-started/neural-networks.html +++ b/articles/getting-started/neural-networks.html @@ -11,12 +11,19 @@ - + + +Note: This is an R port of the official tutorial available here. All credits goes to Soumith Chintala.
+library(torch)Neural networks can be constructed using the nn functionality.
Now that you have had a glimpse of autograd, note that nn depends on autograd to define models and differentiate them. An nn_module contains layers, and a method forward(input) that returns the output.
For example, look at this network that classifies digit images:
@@ -182,44 +208,43 @@ Define the networkLet’s define this network:
-Net <- nn_module( - initialize = function() { - self$conv1 = nn_conv2d(1, 6, 3) - self$conv2 = nn_conv2d(6, 16, 3) +Net <- nn_module( + initialize = function() { + self$conv1 = nn_conv2d(1, 6, 3) + self$conv2 = nn_conv2d(6, 16, 3) # an affine operation: y = Wx + b - self$fc1 = nn_linear(16 * 6 * 6, 120) # 6*6 from image dimension - self$fc2 = nn_linear(120, 84) - self$fc3 = nn_linear(84, 10) - }, - forward = function(x) { - x %>% + self$fc1 = nn_linear(16 * 6 * 6, 120) # 6*6 from image dimension + self$fc2 = nn_linear(120, 84) + self$fc3 = nn_linear(84, 10) + }, + forward = function(x) { + x %>% - self$conv1() %>% - nnf_relu() %>% - nnf_max_pool2d(c(2,2)) %>% + self$conv1() %>% + nnf_relu() %>% + nnf_max_pool2d(c(2,2)) %>% - self$conv2() %>% - nnf_relu() %>% - nnf_max_pool2d(c(2,2)) %>% + self$conv2() %>% + nnf_relu() %>% + nnf_max_pool2d(c(2,2)) %>% - torch_flatten(start_dim = 2) %>% + torch_flatten(start_dim = 2) %>% - self$fc1() %>% - nnf_relu() %>% + self$fc1() %>% + nnf_relu() %>% - self$fc2() %>% - nnf_relu() %>% + self$fc2() %>% + nnf_relu() %>% - self$fc3() - } -) + self$fc3() + } +) -net <- Net() -
You just have to define the forward function, and the backward function (where gradients are computed) is automatically defined for you using autograd. You can use any of the Tensor operations in the forward function.
The learnable parameters of a model are returned by net$parameters.
-str(net$parameters) +str(net$parameters) #> List of 10 #> $ conv1.weight:Float [1:6, 1:1, 1:3, 1:3] #> $ conv1.bias :Float [1:6] @@ -230,22 +255,19 @@ #> $ fc2.weight :Float [1:84, 1:120] #> $ fc2.bias :Float [1:84] #> $ fc3.weight :Float [1:10, 1:84] -#> $ fc3.bias :Float [1:10] -
Let’s try a random 32x32 input. Note: expected input size of this net (LeNet) is 32x32. To use this net on the MNIST dataset, please resize the images from the dataset to 32x32.
-input <- torch_randn(1, 1, 32, 32) -out <- net(input) -out +input <- torch_randn(1, 1, 32, 32) +out <- net(input) +out #> torch_tensor -#> -0.0560 0.0916 0.0401 -0.1081 -0.0183 -0.0508 0.1250 -0.0574 0.0058 0.0025 -#> [ CPUFloatType{1,10} ] -
Zero the gradient buffers of all parameters and backpropagate with random gradients:
-net$zero_grad() -out$backward(torch_randn(1, 10)) -
@@ -284,16 +306,15 @@Note:
nn only supports mini-batches. The entire nn package only supports inputs that are a mini-batch of samples, and not a single sample. For example, nn_conv2d will take in a 4D Tensor of nSamples x nChannels x Height x Width. If you have a single sample, just use input$unsqueeze(1) to add a fake batch dimension.
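As a quick sketch of that last point, using the net defined above and a made-up single image:
x_single  <- torch_randn(1, 32, 32)    # one sample: channels x height x width
x_batched <- x_single$unsqueeze(1)     # add a fake batch dimension: 1 x 1 x 32 x 32
out_single <- net(x_batched)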
There are several different loss functions under the nn package. A simple loss is nnf_mse_loss, which computes the mean-squared error between the input and the target.
For example:
-output <- net(input) -target <- torch_randn(10) # a dummy target, for example -target <- target$view(c(1, -1)) # make it the same shape as output +output <- net(input) +target <- torch_randn(10) # a dummy target, for example +target <- target$view(c(1, -1)) # make it the same shape as output -loss <- nnf_mse_loss(output, target) -loss +loss <- nnf_mse_loss(output, target) +loss #> torch_tensor -#> 0.388282 -#> [ CPUFloatType{} ] -
Now, if you follow loss in the backward direction, using its $grad_fn attribute, you will see a graph of computations that looks like this:
input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
-> view -> linear -> relu -> linear -> relu -> linear
@@ -302,13 +323,12 @@
So, when we call loss$backward(), the whole graph is differentiated w.r.t. the loss, and all Tensors in the graph that have requires_grad=TRUE will have their $grad Tensor accumulated with the gradient.
For illustration, let us follow a few steps backward:
-loss$grad_fn
+loss$grad_fn
#> MseLossBackward
-loss$grad_fn$next_functions[[1]]
+loss$grad_fn$next_functions[[1]]
#> AddmmBackward
-loss$grad_fn$next_functions[[1]]$next_functions[[1]]
-#> torch::autograd::AccumulateGrad
-
+loss$grad_fn$next_functions[[1]]$next_functions[[1]]
+#> torch::autograd::AccumulateGrad
To backpropagate the error all we have to do is call loss$backward(). You need to clear the existing gradients first, though, or the new gradients will be accumulated into the existing ones.
Now we shall call loss$backward(), and have a look at conv1’s bias gradients before and after the backward.
-net$zero_grad() # zeroes the gradient buffers of all parameters +net$zero_grad() # zeroes the gradient buffers of all parameters # conv1.bias.grad before backward -net$conv1$bias$grad +net$conv1$bias$grad #> torch_tensor #> 0 #> 0 @@ -329,20 +349,19 @@ #> 0 #> [ CPUFloatType{6} ] -loss$backward() +loss$backward() # conv1.bias.grad after backward -net$conv1$bias$grad +net$conv1$bias$grad #> torch_tensor -#> 0.001 * -#> 2.3567 -#> -1.3589 -#> -0.6749 -#> 5.5939 -#> -4.2062 -#> 0.6161 -#> [ CPUFloatType{6} ] -
Now, we have seen how to use loss functions.
\[\text{weight} = \text{weight} - \text{learning\_rate} \times \text{gradient}\]
We can implement this using simple R code:
-learning_rate <- 0.01 -for (f in net$parameters) { - with_no_grad({ - f$sub_(f$grad * learning_rate) - }) -} -
Note: Weight updates here are wrapped in
with_no_grad as we don’t want the updates to be tracked by the autograd engine.
However, as you use neural networks, you will want to use various update rules such as SGD, Nesterov-SGD, Adam, RMSProp, etc. The optimizers implemented in torch (for example optim_sgd and optim_adam) provide these for you:
# create your optimizer -optimizer <- optim_sgd(net$parameters, lr = 0.01) +optimizer <- optim_sgd(net$parameters, lr = 0.01) # in your training loop: -optimizer$zero_grad() # zero the gradient buffers -output <- net(input) -loss <- nnf_mse_loss(output, target) -loss$backward() -optimizer$step() # Does the update -#> NULL -
diff --git a/articles/getting-started/new-autograd-functions.html b/articles/getting-started/new-autograd-functions.html index 7b118773a0cfe232f3322afaec1308f95279d6a9..cf978bc00a9681862d01767cabd13ea43a10de53 100644 --- a/articles/getting-started/new-autograd-functions.html +++ b/articles/getting-started/new-autograd-functions.html @@ -11,12 +11,19 @@ - + + +Note: Observe how gradient buffers had to be manually set to zero using
optimizer$zero_grad(). This is because gradients are accumulated as explained in the Backprop section.
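A minimal sketch of that accumulation behaviour, on a fresh tensor made up just for illustration:
x <- torch_tensor(2, requires_grad = TRUE)
(x * 3)$backward()
x$grad           # 3
(x * 3)$backward()
x$grad           # 6: the second backward added to the stored gradient
x$grad$zero_()   # this is what optimizer$zero_grad() does for every parameter
(x * 3)$backward()
x$grad           # 3 again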
Note: This is an R port of the official tutorial available here. All credit goes to Justin Johnson.
+library(torch)Under the hood, each primitive autograd operator is really two functions that operate on Tensors. The forward function computes output Tensors from input Tensors. The backward function receives the gradient of the output Tensors with respect to some scalar value, and computes the gradient of the input Tensors with respect to that same scalar value.
In torch we can easily define our own autograd operator by defining a subclass of autograd_function and implementing the forward and backward functions. We can then use our new autograd operator by constructing an instance and calling it like a function, passing Tensors containing input data.
In this example we define our own custom autograd function for performing the ReLU nonlinearity, and use it to implement our two-layer network:
@@ -168,93 +194,92 @@ # We can implement our own custom autograd Functions by subclassing # autograd_functioon and implementing the forward and backward passes # which operate on Tensors. -my_relu <- autograd_function( +my_relu <- autograd_function( # In the forward pass we receive a Tensor containing the input and return # a Tensor containing the output. ctx is a context object that can be used # to stash information for backward computation. You can cache arbitrary # objects for use in the backward pass using the ctx$save_for_backward method. - forward = function(ctx, input) { - ctx$save_for_backward(input = input) - input$clamp(min = 0) - }, + forward = function(ctx, input) { + ctx$save_for_backward(input = input) + input$clamp(min = 0) + }, # In the backward pass we receive a Tensor containing the gradient of the loss # with respect to the output, and we need to compute the gradient of the loss # with respect to the input. - backward = function(ctx, grad_output) { - v <- ctx$saved_variables - grad_input <- grad_output$clone() - grad_input[v$input < 0] <- 0 - list(input = grad_input) - } -) + backward = function(ctx, grad_output) { + v <- ctx$saved_variables + grad_input <- grad_output$clone() + grad_input[v$input < 0] <- 0 + list(input = grad_input) + } +) -if (cuda_is_available()) { - device <- torch_device("cuda") -} else { - device <- torch_device("cpu") -} +if (cuda_is_available()) { + device <- torch_device("cuda") +} else { + device <- torch_device("cpu") +} # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. -N <- 64 -D_in <- 1000 -H <- 100 -D_out <- 10 +N <- 64 +D_in <- 1000 +H <- 100 +D_out <- 10 # Create random input and output data # Setting requires_grad=FALSE (the default) indicates that we do not need to # compute gradients with respect to these Tensors during the backward pass. -x <- torch_randn(N, D_in, device=device) -y <- torch_randn(N, D_out, device=device) +x <- torch_randn(N, D_in, device=device) +y <- torch_randn(N, D_out, device=device) # Randomly initialize weights # Setting requires_grad=TRUE indicates that we want to compute gradients with # respect to these Tensors during the backward pass. -w1 <- torch_randn(D_in, H, device=device, requires_grad = TRUE) -w2 <- torch_randn(H, D_out, device=device, requires_grad = TRUE) +w1 <- torch_randn(D_in, H, device=device, requires_grad = TRUE) +w2 <- torch_randn(H, D_out, device=device, requires_grad = TRUE) -learning_rate <- 1e-6 -for (t in seq_len(500)) { +learning_rate <- 1e-6 +for (t in seq_len(500)) { # Forward pass: compute predicted y using operations on Tensors; these # are exactly the same operations we used to compute the forward pass using # Tensors, but we do not need to keep references to intermediate values since # we are not implementing the backward pass by hand. - y_pred <- my_relu(x$mm(w1))$mm(w2) + y_pred <- my_relu(x$mm(w1))$mm(w2) # Compute and print loss using operations on Tensors. # Now loss is a Tensor of shape (1,) - loss <- (y_pred - y)$pow(2)$sum() - if (t %% 100 == 0 || t == 1) - cat("Step:", t, ":", as.numeric(loss), "\n") + loss <- (y_pred - y)$pow(2)$sum() + if (t %% 100 == 0 || t == 1) + cat("Step:", t, ":", as.numeric(loss), "\n") # Use autograd to compute the backward pass. This call will compute the # gradient of loss with respect to all Tensors with requires_grad=True. # After this call w1$grad and w2$grad will be Tensors holding the gradient # of the loss with respect to w1 and w2 respectively. 
- loss$backward() + loss$backward() # Manually update weights using gradient descent. Wrap in `with_no_grad` # because weights have requires_grad=TRUE, but we don't need to track this # in autograd. # You can also use optim_sgd to achieve this. - with_no_grad({ + with_no_grad({ # operations suffixed with an `_` operates on in-place on the tensor. - w1$sub_(learning_rate * w1$grad) - w2$sub_(learning_rate * w2$grad) + w1$sub_(learning_rate * w1$grad) + w2$sub_(learning_rate * w2$grad) # Manually zero the gradients after updating weights - w1$grad$zero_() - w2$grad$zero_() - }) -} -#> Step: 1 : 25332368 -#> Step: 100 : 473.5124 -#> Step: 200 : 2.001738 -#> Step: 300 : 0.01279241 -#> Step: 400 : 0.0002553065 -#> Step: 500 : 4.130635e-05 - + w1$grad$zero_() + w2$grad$zero_() + }) +} +#> Step: 1 : 29097476 +#> Step: 100 : 387.3773 +#> Step: 200 : 1.046869 +#> Step: 300 : 0.005221077 +#> Step: 400 : 0.0001420454 +#> Step: 500 : 2.901151e-05In the next example we will learn how to use the neural networks abstractions in torch.
diff --git a/articles/getting-started/nn.html b/articles/getting-started/nn.html index f2d0e7bc110a7ea6ed8d67a0deb8f664b5f050ad..95b9358054e40d0db7b89a98b05bb70a1b6e4947 100644 --- a/articles/getting-started/nn.html +++ b/articles/getting-started/nn.html @@ -11,12 +11,19 @@ - + + +Note: This is an R port of the official tutorial available here. All credits goes to Justin Johnson.
+library(torch)Computational graphs and autograd are a very powerful paradigm for defining complex operators and automatically taking derivatives; however for large neural networks raw autograd can be a bit too low-level.
When building neural networks we frequently think of arranging the computation into layers, some of which have learnable parameters which will be optimized during learning.
In TensorFlow, packages like Keras, TensorFlow-Slim, and TFLearn provide higher-level abstractions over raw computational graphs that are useful for building neural networks.
In torch, the nn functionality serves this same purpose. The nn feature defines a set of Modules, which are roughly equivalent to neural network layers. A Module receives input Tensors and computes output Tensors, but may also hold internal state such as Tensors containing learnable parameters. The nn collection also defines a set of useful loss functions that are commonly used when training neural networks.
In this example we use nn to implement our two-layer network:
-if (cuda_is_available()) { - device <- torch_device("cuda") -} else { - device <- torch_device("cpu") -} +if (cuda_is_available()) { + device <- torch_device("cuda") +} else { + device <- torch_device("cpu") +} # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. -N <- 64 -D_in <- 1000 -H <- 100 -D_out <- 10 +N <- 64 +D_in <- 1000 +H <- 100 +D_out <- 10 # Create random input and output data # Setting requires_grad=FALSE (the default) indicates that we do not need to # compute gradients with respect to these Tensors during the backward pass. -x <- torch_randn(N, D_in, device=device) -y <- torch_randn(N, D_out, device=device) +x <- torch_randn(N, D_in, device=device) +y <- torch_randn(N, D_out, device=device) # Use the nn package to define our model as a sequence of layers. nn_sequential # is a Module which contains other Modules, and applies them in sequence to # produce its output. Each Linear Module computes output from input using a # linear function, and holds internal Tensors for its weight and bias. -model <- nn_sequential( - nn_linear(D_in, H), - nn_relu(), - nn_linear(H, D_out) -) +model <- nn_sequential( + nn_linear(D_in, H), + nn_relu(), + nn_linear(H, D_out) +) # The nn package also contains definitions of popular loss functions; in this # case we will use Mean Squared Error (MSE) as our loss function. -loss_fn <- nnf_mse_loss +loss_fn <- nnf_mse_loss -learning_rate <- 1e-6 -for (t in seq_len(500)) { +learning_rate <- 1e-6 +for (t in seq_len(500)) { # Forward pass: compute predicted y by passing x to the model. Module objects # can be called like functions. When doing so you pass a Tensor of input # data to the Module and it produces a Tensor of output data. - y_pred <- model(x) + y_pred <- model(x) # Compute and print loss. We pass Tensors containing the predicted and true # values of y, and the loss function returns a Tensor containing the # loss. - loss <- loss_fn(y_pred, y) - if (t %% 100 == 0 || t == 1) - cat("Step:", t, ":", as.numeric(loss), "\n") + loss <- loss_fn(y_pred, y) + if (t %% 100 == 0 || t == 1) + cat("Step:", t, ":", as.numeric(loss), "\n") # Zero the gradients before running the backward pass. - model$zero_grad() + model$zero_grad() # Backward pass: compute gradient of the loss with respect to all the learnable # parameters of the model. Internally, the parameters of each Module are stored # in Tensors with requires_grad=TRUE, so this call will compute gradients for # all learnable parameters in the model. - loss$backward() + loss$backward() # Update the weights using gradient descent. Each parameter is a Tensor, so # we can access its gradients like we did before. - with_no_grad({ - for (param in model$parameters) { - param$sub_(learning_rate * param$grad) - } - }) -} -#> Step: 1 : 1.04115 -#> Step: 100 : 1.041026 -#> Step: 200 : 1.040901 -#> Step: 300 : 1.040776 -#> Step: 400 : 1.04065 -#> Step: 500 : 1.040525 -
In the next example we will learn how to use optimizers implemented in torch.
diff --git a/articles/getting-started/optim.html b/articles/getting-started/optim.html index d5bcc6308a6f28f8b766ec2bf8fa624b8fbbf4e6..4c93b57aa0ad944bc24ded1620fb4782f2a7895c 100644 --- a/articles/getting-started/optim.html +++ b/articles/getting-started/optim.html @@ -11,12 +11,19 @@ - + + +Note: This is an R port of the official tutorial available here. All credits goes to Justin Johnson.
+library(torch)Up to this point we have updated the weights of our models by manually mutating the Tensors holding learnable parameters (with with_no_grad to avoid tracking history in autograd). This is not a huge burden for simple optimization algorithms like stochastic gradient descent, but in practice we often train neural networks using more sophisticated optimizers like AdaGrad, RMSProp, Adam, etc.
The optim package in torch abstracts the idea of an optimization algorithm and provides implementations of commonly used optimization algorithms.
In this example we will use the nn package to define our model as before, but we will optimize the model using the Adam algorithm provided by optim:
-if (cuda_is_available()) { - device <- torch_device("cuda") -} else { - device <- torch_device("cpu") -} +if (cuda_is_available()) { + device <- torch_device("cuda") +} else { + device <- torch_device("cpu") +} # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. -N <- 64 -D_in <- 1000 -H <- 100 -D_out <- 10 +N <- 64 +D_in <- 1000 +H <- 100 +D_out <- 10 # Create random input and output data # Setting requires_grad=FALSE (the default) indicates that we do not need to # compute gradients with respect to these Tensors during the backward pass. -x <- torch_randn(N, D_in, device=device) -y <- torch_randn(N, D_out, device=device) +x <- torch_randn(N, D_in, device=device) +y <- torch_randn(N, D_out, device=device) # Use the nn package to define our model as a sequence of layers. nn_sequential # is a Module which contains other Modules, and applies them in sequence to # produce its output. Each Linear Module computes output from input using a # linear function, and holds internal Tensors for its weight and bias. -model <- nn_sequential( - nn_linear(D_in, H), - nn_relu(), - nn_linear(H, D_out) -) +model <- nn_sequential( + nn_linear(D_in, H), + nn_relu(), + nn_linear(H, D_out) +) # The nn package also contains definitions of popular loss functions; in this # case we will use Mean Squared Error (MSE) as our loss function. -loss_fn <- nnf_mse_loss +loss_fn <- nnf_mse_loss # Use the optim package to define an Optimizer that will update the weights of # the model for us. Here we will use Adam; the optim package contains many other # optimization algorithms. The first argument to the Adam constructor tells the # optimizer which Tensors it should update. -learning_rate <- 1e-4 -optimizer <- optim_adam(model$parameters, lr=learning_rate) +learning_rate <- 1e-4 +optimizer <- optim_adam(model$parameters, lr=learning_rate) -for (t in seq_len(500)) { +for (t in seq_len(500)) { # Forward pass: compute predicted y by passing x to the model. Module objects # can be called like functions. When doing so you pass a Tensor of input # data to the Module and it produces a Tensor of output data. - y_pred <- model(x) + y_pred <- model(x) # Compute and print loss. We pass Tensors containing the predicted and true # values of y, and the loss function returns a Tensor containing the # loss. - loss <- loss_fn(y_pred, y) - if (t %% 100 == 0 || t == 1) - cat("Step:", t, ":", as.numeric(loss), "\n") + loss <- loss_fn(y_pred, y) + if (t %% 100 == 0 || t == 1) + cat("Step:", t, ":", as.numeric(loss), "\n") # Before the backward pass, use the optimizer object to zero all of the # gradients for the variables it will update (which are the learnable # weights of the model). This is because by default, gradients are # accumulated in buffers( i.e, not overwritten) whenever $backward() # is called. Checkout docs of `autograd_backward` for more details. - optimizer$zero_grad() + optimizer$zero_grad() # Backward pass: compute gradient of the loss with respect to model # parameters - loss$backward() + loss$backward() # Calling the step function on an Optimizer makes an update to its # parameters - optimizer$step() -} -#> Step: 1 : 1.03194 -#> Step: 100 : 0.08338322 -#> Step: 200 : 0.001254716 -#> Step: 300 : 3.605265e-06 -#> Step: 400 : 2.155708e-09 -#> Step: 500 : 5.439427e-13 -
In the next example we will learn how to create custom nn_modules.
Note: This is an R port of the official tutorial available here. All credit goes to Justin Johnson.
+library(torch)In the previous examples, we had to manually implement both the forward and backward passes of our neural network. Manually implementing the backward pass is not a big deal for a small two-layer network, but can quickly get very hairy for large complex networks.
Thankfully, we can use automatic differentiation to automate the computation of backward passes in neural networks. The autograd feature in torch provides exactly this functionality. When using autograd, the forward pass of your network will define a computational graph; nodes in the graph will be Tensors, and edges will be functions that produce output Tensors from input Tensors. Backpropagating through this graph then allows you to easily compute gradients.
This sounds complicated, but it’s pretty simple to use in practice. Each Tensor represents a node in a computational graph. If x is a Tensor that has x$requires_grad=TRUE then x$grad is another Tensor holding the gradient of x with respect to some scalar value.
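A minimal sketch of that idea (the tensors here are made up for illustration):
x <- torch_randn(3, requires_grad = TRUE)
y <- x$pow(2)$sum()   # a scalar computed from x
y$backward()
x$grad                # holds dy/dx, i.e. 2 * x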
Here we use torch Tensors and autograd to implement our two-layer network; now we no longer need to manually implement the backward pass through the network:
-if (cuda_is_available()) { - device <- torch_device("cuda") -} else { - device <- torch_device("cpu") -} +if (cuda_is_available()) { + device <- torch_device("cuda") +} else { + device <- torch_device("cpu") +} # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. -N <- 64 -D_in <- 1000 -H <- 100 -D_out <- 10 +N <- 64 +D_in <- 1000 +H <- 100 +D_out <- 10 # Create random input and output data # Setting requires_grad=FALSE (the default) indicates that we do not need to # compute gradients with respect to these Tensors during the backward pass. -x <- torch_randn(N, D_in, device=device) -y <- torch_randn(N, D_out, device=device) +x <- torch_randn(N, D_in, device=device) +y <- torch_randn(N, D_out, device=device) # Randomly initialize weights # Setting requires_grad=TRUE indicates that we want to compute gradients with # respect to these Tensors during the backward pass. -w1 <- torch_randn(D_in, H, device=device, requires_grad = TRUE) -w2 <- torch_randn(H, D_out, device=device, requires_grad = TRUE) +w1 <- torch_randn(D_in, H, device=device, requires_grad = TRUE) +w2 <- torch_randn(H, D_out, device=device, requires_grad = TRUE) -learning_rate <- 1e-6 -for (t in seq_len(500)) { +learning_rate <- 1e-6 +for (t in seq_len(500)) { # Forward pass: compute predicted y using operations on Tensors; these # are exactly the same operations we used to compute the forward pass using # Tensors, but we do not need to keep references to intermediate values since # we are not implementing the backward pass by hand. - y_pred <- x$mm(w1)$clamp(min=0)$mm(w2) + y_pred <- x$mm(w1)$clamp(min=0)$mm(w2) # Compute and print loss using operations on Tensors. # Now loss is a Tensor of shape (1,) - loss <- (y_pred - y)$pow(2)$sum() - if (t %% 100 == 0 || t == 1) - cat("Step:", t, ":", as.numeric(loss), "\n") + loss <- (y_pred - y)$pow(2)$sum() + if (t %% 100 == 0 || t == 1) + cat("Step:", t, ":", as.numeric(loss), "\n") # Use autograd to compute the backward pass. This call will compute the # gradient of loss with respect to all Tensors with requires_grad=True. # After this call w1$grad and w2$grad will be Tensors holding the gradient # of the loss with respect to w1 and w2 respectively. - loss$backward() + loss$backward() # Manually update weights using gradient descent. Wrap in `with_no_grad` # because weights have requires_grad=TRUE, but we don't need to track this # in autograd. # You can also use optim_sgd to achieve this. - with_no_grad({ + with_no_grad({ # operations suffixed with an `_` operates on in-place on the tensor. - w1$sub_(learning_rate * w1$grad) - w2$sub_(learning_rate * w2$grad) + w1$sub_(learning_rate * w1$grad) + w2$sub_(learning_rate * w2$grad) # Manually zero the gradients after updating weights - w1$grad$zero_() - w2$grad$zero_() - }) -} -#> Step: 1 : 27399256 -#> Step: 100 : 756.7294 -#> Step: 200 : 9.271971 -#> Step: 300 : 0.205474 -#> Step: 400 : 0.00579866 -#> Step: 500 : 0.0003981641 -
In the next example we will learn how to create new autograd functions.
diff --git a/articles/getting-started/tensors.html b/articles/getting-started/tensors.html index f79bad2fb5585214b7da72e4d251bef9b4ecb4e0..9eb3f10a7e6c9206061daeab3ac8612e0f58d990 100644 --- a/articles/getting-started/tensors.html +++ b/articles/getting-started/tensors.html @@ -11,12 +11,19 @@ - + + +Note: This is an R port of the official tutorial available here. All credits goes to Justin Johnson.
+library(torch)R arrays are great, but they cannot utilize GPUs to accelerate their numerical computations. For modern deep neural networks, GPUs often provide speedups of 50x or greater, so unfortunately pure R won’t be enough for modern deep learning.
Here we introduce the most fundamental torch concept: the Tensor. A torch Tensor is conceptually similar to an R array: a Tensor is an n-dimensional array, and torch provides many functions for operating on these Tensors. Behind the scenes, Tensors can keep track of a computational graph and gradients, but they’re also useful as a generic tool for scientific computing.
Also unlike R, torch Tensors can utilize GPUs to accelerate their numeric computations. To run a torch Tensor on a GPU, you simply need to move it to the cuda device.
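For example, a small sketch (guarded so it also runs on machines without a GPU):
if (cuda_is_available()) {
  gpu <- torch_device("cuda")
  a <- torch_randn(2, 2, device = gpu)     # create a tensor directly on the GPU
  b <- torch_ones(2, 2)$to(device = gpu)   # or move an existing tensor to it
  print(a + b)                             # the computation runs on the GPU
}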
Here we use torch Tensors to fit a two-layer network to random data. Like the pure R example before, we need to manually implement the forward and backward passes through the network:
-if (cuda_is_available()) { - device <- torch_device("cuda") -} else { - device <- torch_device("cpu") -} +if (cuda_is_available()) { + device <- torch_device("cuda") +} else { + device <- torch_device("cpu") +} # N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. -N <- 64 -D_in <- 1000 -H <- 100 -D_out <- 10 +N <- 64 +D_in <- 1000 +H <- 100 +D_out <- 10 # Create random input and output data -x <- torch_randn(N, D_in, device=device) -y <- torch_randn(N, D_out, device=device) +x <- torch_randn(N, D_in, device=device) +y <- torch_randn(N, D_out, device=device) # Randomly initialize weights -w1 <- torch_randn(D_in, H, device=device) -w2 <- torch_randn(H, D_out, device=device) +w1 <- torch_randn(D_in, H, device=device) +w2 <- torch_randn(H, D_out, device=device) -learning_rate <- 1e-6 -for (t in seq_len(500)) { +learning_rate <- 1e-6 +for (t in seq_len(500)) { # Forward pass: compute predicted y - h <- x$mm(w1) - h_relu <- h$clamp(min=0) - y_pred <- h_relu$mm(w2) + h <- x$mm(w1) + h_relu <- h$clamp(min=0) + y_pred <- h_relu$mm(w2) # Compute and print loss - loss <- as.numeric((y_pred - y)$pow(2)$sum()) - if (t %% 100 == 0 || t == 1) - cat("Step:", t, ":", loss, "\n") + loss <- as.numeric((y_pred - y)$pow(2)$sum()) + if (t %% 100 == 0 || t == 1) + cat("Step:", t, ":", loss, "\n") # Backprop to compute gradients of w1 and w2 with respect to loss - grad_y_pred <- 2.0 * (y_pred - y) - grad_w2 <- h_relu$t()$mm(grad_y_pred) - grad_h_relu <- grad_y_pred$mm(w2$t()) - grad_h <- grad_h_relu$clone() - grad_h[h < 0] <- 0 - grad_w1 <- x$t()$mm(grad_h) + grad_y_pred <- 2.0 * (y_pred - y) + grad_w2 <- h_relu$t()$mm(grad_y_pred) + grad_h_relu <- grad_y_pred$mm(w2$t()) + grad_h <- grad_h_relu$clone() + grad_h[h < 0] <- 0 + grad_w1 <- x$t()$mm(grad_h) # Update weights using gradient descent - w1 <- w1 - learning_rate * grad_w1 - w2 <- w2 - learning_rate * grad_w2 -} -#> Step: 1 : 31418640 -#> Step: 100 : 959.8534 -#> Step: 200 : 34.41588 -#> Step: 300 : 1.953276 -#> Step: 400 : 0.1213879 -#> Step: 500 : 0.008213471 -
In the next example we will use autograd instead of computing the gradients manually.
diff --git a/articles/getting-started/warmup.html b/articles/getting-started/warmup.html index f85e2bf0630bcd80636c187ec9b3e909eea037df..ed4195e826012f7a18f4812233d7f853d5801ec3 100644 --- a/articles/getting-started/warmup.html +++ b/articles/getting-started/warmup.html @@ -11,12 +11,19 @@ - + + +Note: This is an R port of the official tutorial available here. All credits goes to Justin Johnson.
+library(torch)A fully-connected ReLU network with one hidden layer and no biases, trained to predict y from x using Euclidean error.
This implementation uses pure R to manually compute the forward pass, loss, and backward pass.
An R array is a generic n-dimensional array; it does not know anything about deep learning or gradients or computational graphs, and is just a way to perform generic numeric computations.
# N is batch size; D_in is input dimension; # H is hidden dimension; D_out is output dimension. -N <- 64 -D_in <- 1000 -H <- 100 -D_out <- 10 +N <- 64 +D_in <- 1000 +H <- 100 +D_out <- 10 # Create random input and output data -x <- array(rnorm(N*D_in), dim = c(N, D_in)) -y <- array(rnorm(N*D_out), dim = c(N, D_out)) +x <- array(rnorm(N*D_in), dim = c(N, D_in)) +y <- array(rnorm(N*D_out), dim = c(N, D_out)) # Randomly initialize weights -w1 <- array(rnorm(D_in*H), dim = c(D_in, H)) -w2 <- array(rnorm(H*D_out), dim = c(H, D_out)) +w1 <- array(rnorm(D_in*H), dim = c(D_in, H)) +w2 <- array(rnorm(H*D_out), dim = c(H, D_out)) -learning_rate <- 1e-6 -for (t in seq_len(500)) { +learning_rate <- 1e-6 +for (t in seq_len(500)) { # Forward pass: compute predicted y - h <- x %*% w1 - h_relu <- ifelse(h < 0, 0, h) - y_pred <- h_relu %*% w2 + h <- x %*% w1 + h_relu <- ifelse(h < 0, 0, h) + y_pred <- h_relu %*% w2 # Compute and print loss - loss <- sum((y_pred - y)^2) - if (t %% 100 == 0 || t == 1) - cat("Step:", t, ":", loss, "\n") + loss <- sum((y_pred - y)^2) + if (t %% 100 == 0 || t == 1) + cat("Step:", t, ":", loss, "\n") # Backprop to compute gradients of w1 and w2 with respect to loss - grad_y_pred <- 2 * (y_pred - y) - grad_w2 <- t(h_relu) %*% grad_y_pred - grad_h_relu <- grad_y_pred %*% t(w2) - grad_h <- grad_h_relu - grad_h[h < 0] <- 0 - grad_w1 <- t(x) %*% grad_h + grad_y_pred <- 2 * (y_pred - y) + grad_w2 <- t(h_relu) %*% grad_y_pred + grad_h_relu <- grad_y_pred %*% t(w2) + grad_h <- grad_h_relu + grad_h[h < 0] <- 0 + grad_w1 <- t(x) %*% grad_h # Update weights - w1 <- w1 - learning_rate * grad_w1 - w2 <- w2 - learning_rate * grad_w2 -} -#> Step: 1 : 28115720 -#> Step: 100 : 536.8496 -#> Step: 200 : 2.748443 -#> Step: 300 : 0.01913319 -#> Step: 400 : 0.0001405911 -#> Step: 500 : 1.056453e-06 -
In the next example we will replace the R array with a torch Tensor.
diff --git a/articles/getting-started/what-is-torch.html b/articles/getting-started/what-is-torch.html index cf83c6ada393e32104e3af08ea96f1f344cf429a..3f7253688ad2586b663b2adc3ba2d43a57cf1e90 100644 --- a/articles/getting-started/what-is-torch.html +++ b/articles/getting-started/what-is-torch.html @@ -11,12 +11,19 @@ - + + +Note: This is an R port of the official tutorial available here. All credits goes to Soumith Chintala.
+library(torch)It’s a scientific computing package targeted at two sets of audiences:
Construct a 5x3 matrix, uninitialized:
-x <- torch_empty(5, 3) -x +x <- torch_empty(5, 3) +x #> torch_tensor -#> 0.0000e+00 1.5846e+29 1.7045e+23 -#> -1.5849e+29 7.0065e-45 0.0000e+00 -#> 0.0000e+00 0.0000e+00 0.0000e+00 -#> 0.0000e+00 0.0000e+00 0.0000e+00 -#> 0.0000e+00 0.0000e+00 0.0000e+00 -#> [ CPUFloatType{5,3} ] -
Construct a randomly initialized matrix:
-x <- torch_rand(5, 3) -x +x <- torch_rand(5, 3) +x #> torch_tensor -#> 0.2655 0.4783 0.8396 -#> 0.2444 0.6980 0.7983 -#> 0.6614 0.9127 0.8649 -#> 0.8563 0.3856 0.0944 -#> 0.8246 0.9818 0.8342 -#> [ CPUFloatType{5,3} ] -
Construct a matrix filled with zeros, of dtype long:
-x <- torch_zeros(5, 3, dtype = torch_long()) -x +x <- torch_zeros(5, 3, dtype = torch_long()) +x #> torch_tensor #> 0 0 0 #> 0 0 0 #> 0 0 0 #> 0 0 0 #> 0 0 0 -#> [ CPULongType{5,3} ] -
Construct a tensor directly from data:
-x <- torch_tensor(c(5.5, 3)) -x +x <- torch_tensor(c(5.5, 3)) +x #> torch_tensor #> 5.5000 #> 3.0000 -#> [ CPUFloatType{2} ] -
or create a tensor based on an existing tensor. These methods will reuse properties of the input tensor, e.g. dtype, unless new values are provided by the user:
-x <- torch_randn_like(x, dtype = torch_float()) # override dtype! -x # result has the same size +x <- torch_randn_like(x, dtype = torch_float()) # override dtype! +x # result has the same size #> torch_tensor -#> 0.7172 -#> 0.9112 -#> [ CPUFloatType{2} ] -
Get its size:
-x$size() -#> [1] 2 -
There are multiple syntaxes for operations. In the following example, we will take a look at the addition operation.
Addition: syntax 1
-x <- torch_rand(5, 3) -y <- torch_rand(5, 3) -x + y +x <- torch_rand(5, 3) +y <- torch_rand(5, 3) +x + y #> torch_tensor -#> 0.7737 0.8053 0.3150 -#> 1.3053 1.0479 0.5301 -#> 1.6027 0.7272 0.9115 -#> 1.3239 0.8749 0.7270 -#> 1.1298 1.3922 0.7527 -#> [ CPUFloatType{5,3} ] -
Addition: syntax 2
-torch_add(x, y) +torch_add(x, y) #> torch_tensor -#> 0.7737 0.8053 0.3150 -#> 1.3053 1.0479 0.5301 -#> 1.6027 0.7272 0.9115 -#> 1.3239 0.8749 0.7270 -#> 1.1298 1.3922 0.7527 -#> [ CPUFloatType{5,3} ] -
Addition: in-place
-y$add_(x) +y$add_(x) #> torch_tensor -#> 0.7737 0.8053 0.3150 -#> 1.3053 1.0479 0.5301 -#> 1.6027 0.7272 0.9115 -#> 1.3239 0.8749 0.7270 -#> 1.1298 1.3922 0.7527 +#> 1.0605 0.8204 0.2822 +#> 1.3213 1.3433 0.6433 +#> 1.8328 0.3214 0.6732 +#> 1.2971 1.3126 1.6909 +#> 0.6892 1.0732 1.5375 #> [ CPUFloatType{5,3} ] -y +y #> torch_tensor -#> 0.7737 0.8053 0.3150 -#> 1.3053 1.0479 0.5301 -#> 1.6027 0.7272 0.9115 -#> 1.3239 0.8749 0.7270 -#> 1.1298 1.3922 0.7527 -#> [ CPUFloatType{5,3} ] -
Note: Any operation that mutates a tensor in-place is post-fixed with an
_. For example, x$copy_(y) and x$t_() will change x.
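A small sketch of the difference:
x <- torch_ones(2, 2)
y <- torch_zeros(2, 2)
x$add(y)    # returns a new tensor; x is unchanged
x$copy_(y)  # copies the values of y into x, modifying x in place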
You can use standard R-like indexing with all bells and whistles! See more about indexing with vignette("indexing").
-x[, 1] +x[, 1] #> torch_tensor -#> 0.4454 -#> 0.5480 -#> 0.7439 -#> 0.7984 -#> 0.9449 -#> [ CPUFloatType{5} ] -
Resizing: If you want to resize/reshape a tensor, you can use the $view() method:
-x <- torch_randn(4, 4) -y <- x$view(16) -z <- x$view(size = c(-1, 8)) # the size -1 is inferred from other dimensions -x$size() +x <- torch_randn(4, 4) +y <- x$view(16) +z <- x$view(size = c(-1, 8)) # the size -1 is inferred from other dimensions +x$size() #> [1] 4 4 -y$size() +y$size() #> [1] 16 -z$size() -#> [1] 2 8 -
If you have a one element tensor, use $item() to get the value as an R number
-x <- torch_randn(1) -x +x <- torch_randn(1) +x #> torch_tensor -#> 1.1464 +#> -1.6019 #> [ CPUFloatType{1} ] -x$item() -#> [1] 1.146403 -
You can find a complete list of operations in the reference page.
@@ -330,39 +345,36 @@-a <- torch_ones(5) -a +a <- torch_ones(5) +a #> torch_tensor #> 1 #> 1 #> 1 #> 1 #> 1 -#> [ CPUFloatType{5} ] -
-b <- as_array(a) -b -#> [1] 1 1 1 1 1 -
-a <- rep(1, 5) -a +a <- rep(1, 5) +a #> [1] 1 1 1 1 1 -b <- torch_tensor(a) -b +b <- torch_tensor(a) +b #> torch_tensor #> 1 #> 1 #> 1 #> 1 #> 1 -#> [ CPUFloatType{5} ] -
Currently, numeric and boolean types are supported.
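For example, a small sketch showing a logical vector round-tripping through a boolean tensor:

b <- torch_tensor(c(TRUE, FALSE, TRUE))
b           # a CPUBoolType tensor
as_array(b) # back to an R logical vector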
@@ -371,15 +383,14 @@ CUDA tensorsTensors can be moved onto any device using the $to method.
-if (cuda_is_available()) { - device <- torch_device("cuda") - y <- torch_ones_like(x, device = device) # directly create a tensor on GPU - x <- x$to(device) # or just use strings ``.to("cuda")`` - z <- x + y - print(z) - print(z$to(device = "cpu", torch_double())) # `$to` can also change dtype together! -} -
In this article we describe the indexing operator for torch tensors and how it compares to the R indexing operator for arrays.
Torch’s indexing semantics are closer to numpy’s semantics than R’s. You will find a lot of similarities between this article and the numpy indexing article available here.
Single element indexing for 1-D tensors works mostly as expected. Like R, it is 1-based. Unlike R though, it accepts negative indices for indexing from the end of the array. (In R, negative indices are used to remove elements.)
-x <- torch_tensor(1:10) -x[1] +x <- torch_tensor(1:10) +x[1] #> torch_tensor #> 1 #> [ CPULongType{} ] -x[-1] +x[-1] #> torch_tensor #> 10 -#> [ CPULongType{} ] -
You can also subset matrices and higher-dimensional arrays using the same syntax:
-x <- x$reshape(shape = c(2,5)) -x +x <- x$reshape(shape = c(2,5)) +x #> torch_tensor #> 1 2 3 4 5 #> 6 7 8 9 10 #> [ CPULongType{2,5} ] -x[1,3] +x[1,3] #> torch_tensor #> 3 #> [ CPULongType{} ] -x[1,-1] +x[1,-1] #> torch_tensor #> 5 -#> [ CPULongType{} ] -
Note that if one indexes a multidimensional tensor with fewer indices than dimensions, one gets a subdimensional tensor containing all elements of the omitted dimensions (unlike in R, where indexing a matrix with a single number selects one element of the flattened array). For example:
-x[1] +x[1] #> torch_tensor #> 1 #> 2 #> 3 #> 4 #> 5 -#> [ CPULongType{5} ] -
It is possible to slice and stride arrays to extract sub-arrays of the same number of dimensions, but of different sizes than the original. This is best illustrated by a few examples:
-x <- torch_tensor(1:10) -x +x <- torch_tensor(1:10) +x #> torch_tensor #> 1 #> 2 @@ -223,33 +246,31 @@ #> 9 #> 10 #> [ CPULongType{10} ] -x[2:5] +x[2:5] #> torch_tensor #> 2 #> 3 #> 4 #> 5 #> [ CPULongType{4} ] -x[1:(-7)] +x[1:(-7)] #> torch_tensor #> 1 #> 2 #> 3 #> 4 -#> [ CPULongType{4} ] -
You can also use the 1:10:2 syntax which means: In the range from 1 to 10, take every second item. For example:
-x[1:5:2] +x[1:5:2] #> torch_tensor #> 1 #> 3 #> 5 -#> [ CPULongType{3} ] -
Another special value is N, meaning the size of the specified dimension.
-x[5:N] +x[5:N] #> torch_tensor #> 5 #> 6 @@ -257,8 +278,7 @@ #> 8 #> 9 #> 10 -#> [ CPULongType{6} ] -
Like in R, you can take all elements in a dimension by leaving an index empty.
Consider a matrix:
-x <- torch_randn(2, 3) -x +x <- torch_randn(2, 3) +x #> torch_tensor -#> 1.4158 0.9219 -0.1461 -#> 0.9801 0.7556 0.2140 -#> [ CPUFloatType{2,3} ] -
The following syntax will give you the first row:
-x[1,] +x[1,] #> torch_tensor -#> 1.4158 -#> 0.9219 -#> -0.1461 -#> [ CPUFloatType{3} ] -
And this would give you the first 2 columns:
-x[,1:2] +x[,1:2] #> torch_tensor -#> 1.4158 0.9219 -#> 0.9801 0.7556 -#> [ CPUFloatType{2,2} ] -
By default, when indexing by a single integer, this dimension will be dropped to avoid the singleton dimension:
-x <- torch_randn(2, 3) -x[1,]$shape -#> [1] 3 -
You can optionally use the drop = FALSE argument to avoid dropping the dimension.
-x[1,,drop = FALSE]$shape -#> [1] 1 3 -
It’s possible to add a new dimension to a tensor using index-like syntax:
-x <- torch_tensor(c(10)) -x$shape +x <- torch_tensor(c(10)) +x$shape #> [1] 1 -x[, newaxis]$shape +x[, newaxis]$shape #> [1] 1 1 -x[, newaxis, newaxis]$shape -#> [1] 1 1 1 -
You can also use NULL instead of newaxis:
-x[,NULL]$shape -#> [1] 1 1 -
Sometimes we don’t know how many dimensions a tensor has, but we do know what to do with the last available dimension, or the first one. To subsume all others, we can use ..:
-z <- torch_tensor(1:125)$reshape(c(5,5,5)) -z[1,..] +z <- torch_tensor(1:125)$reshape(c(5,5,5)) +z[1,..] #> torch_tensor #> 1 2 3 4 5 #> 6 7 8 9 10 @@ -339,15 +352,14 @@ #> 16 17 18 19 20 #> 21 22 23 24 25 #> [ CPULongType{5,5} ] -z[..,1] +z[..,1] #> torch_tensor #> 1 6 11 16 21 #> 26 31 36 41 46 #> 51 56 61 66 71 #> 76 81 86 91 96 #> 101 106 111 116 121 -#> [ CPULongType{5,5} ] -
-library(palmerpenguins) -library(magrittr) +library(palmerpenguins) +library(magrittr) -penguins +penguins #> # A tibble: 344 x 8 #> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g #> <fct> <fct> <dbl> <dbl> <int> <int> @@ -203,52 +229,50 @@ #> 8 Adelie Torge… 39.2 19.6 195 4675 #> 9 Adelie Torge… 34.1 18.1 193 3475 #> 10 Adelie Torge… 42 20.2 190 4250 -#> # … with 334 more rows, and 2 more variables: sex <fct>, year <int> -
Datasets are R6 classes created using the dataset() constructor. You can pass a name and various member functions. Among those should be initialize(), to create instance variables, .getitem(), to indicate how the data should be returned, and .length(), to say how many items we have.
In addition, any number of helper functions can be defined.
Here, we assume the penguins have already been loaded, and all preprocessing consists of removing rows with NA values, converting factors to numeric values, and converting from R data types to torch tensors.
In .getitem, we essentially decide how this data is going to be used: All variables besides species go into x, the predictor, and species will constitute y, the target. Predictor and target are returned in a list, to be accessed as batch[[1]] and batch[[2]] during training.
-penguins_dataset <- dataset( +penguins_dataset <- dataset( - name = "penguins_dataset", + name = "penguins_dataset", - initialize = function() { - self$data <- self$prepare_penguin_data() - }, + initialize = function() { + self$data <- self$prepare_penguin_data() + }, - .getitem = function(index) { + .getitem = function(index) { - x <- self$data[index, 2:-1] - y <- self$data[index, 1]$to(torch_long()) + x <- self$data[index, 2:-1] + y <- self$data[index, 1]$to(torch_long()) - list(x, y) - }, + list(x, y) + }, - .length = function() { - self$data$size()[[1]] - }, + .length = function() { + self$data$size()[[1]] + }, - prepare_penguin_data = function() { + prepare_penguin_data = function() { - input <- na.omit(penguins) + input <- na.omit(penguins) # conveniently, the categorical data are already factors - input$species <- as.numeric(input$species) - input$island <- as.numeric(input$island) - input$sex <- as.numeric(input$sex) + input$species <- as.numeric(input$species) + input$island <- as.numeric(input$island) + input$sex <- as.numeric(input$sex) - input <- as.matrix(input) - torch_tensor(input) - } -) -
Let’s create the dataset, query its length, and look at its first item:
-tuxes <- penguins_dataset() -tuxes$.length() +tuxes <- penguins_dataset() +tuxes$.length() #> [1] 333 -tuxes$.getitem(1) +tuxes$.getitem(1) #> [[1]] #> torch_tensor #> 3.0000 @@ -263,22 +287,19 @@ #> [[2]] #> torch_tensor #> 1 -#> [ CPULongType{} ] -
To be able to iterate over tuxes, we need a data loader (we override the default batch size of 1):
-dl <-tuxes %>% dataloader(batch_size = 8) -
Calling .length() on a data loader (as opposed to a dataset) will return the number of batches we have:
-dl$.length() -#> [1] 42 -
And we can create an iterator to inspect the first batch:
-iter <- dl$.iter() -b <- iter$.next() -b +iter <- dl$.iter() +b <- iter$.next() +b #> [[1]] #> torch_tensor #> 3.0000 39.1000 18.7000 181.0000 3750.0000 2.0000 2007.0000 @@ -301,8 +322,7 @@ #> 1 #> 1 #> 1 -#> [ CPULongType{8} ] -
To train a network, we can use enumerate to iterate over batches.
Our example network is very simple. (In reality, we would want to treat island as the categorical variable it is, and either one-hot-encode or embed it.)
-net <- nn_module( +net <- nn_module( "PenguinNet", - initialize = function() { - self$fc1 <- nn_linear(7, 32) - self$fc2 <- nn_linear(32, 3) - }, - forward = function(x) { - x %>% - self$fc1() %>% - nnf_relu() %>% - self$fc2() %>% - nnf_log_softmax(dim = 1) - } -) + initialize = function() { + self$fc1 <- nn_linear(7, 32) + self$fc2 <- nn_linear(32, 3) + }, + forward = function(x) { + x %>% + self$fc1() %>% + nnf_relu() %>% + self$fc2() %>% + nnf_log_softmax(dim = 1) + } +) -model <- net() -
We still need an optimizer:
-optimizer <- optim_sgd(model$parameters, lr = 0.01) -
And we’re ready to train:
-for (epoch in 1:10) { +for (epoch in 1:10) { - l <- c() + l <- c() - for (b in enumerate(dl)) { - optimizer$zero_grad() - output <- model(b[[1]]) - loss <- nnf_nll_loss(output, b[[2]]) - loss$backward() - optimizer$step() - l <- c(l, loss$item()) - } + for (b in enumerate(dl)) { + optimizer$zero_grad() + output <- model(b[[1]]) + loss <- nnf_nll_loss(output, b[[2]]) + loss$backward() + optimizer$step() + l <- c(l, loss$item()) + } - cat(sprintf("Loss at epoch %d: %3f\n", epoch, mean(l))) -} + cat(sprintf("Loss at epoch %d: %3f\n", epoch, mean(l))) +} #> Loss at epoch 1: 51.747068 #> Loss at epoch 2: 2.068251 #> Loss at epoch 3: 2.068251 @@ -357,8 +375,7 @@ #> Loss at epoch 7: 2.068251 #> Loss at epoch 8: 2.068251 #> Loss at epoch 9: 2.068251 -#> Loss at epoch 10: 2.068251 -
Torch tensors in R are pointers to Tensors allocated by LibTorch. This has one major consequence for serialization: one cannot simply use saveRDS for serializing tensors, as that would save the pointer but not the data itself. When reloading a tensor saved with saveRDS, the pointer might have been deleted in LibTorch and you would get wrong results.
To solve this problem, torch implements specialized functions for serializing tensors to the disk:
torch_save(): to save tensors and models to the disk.torch_load(): to load the models or tensors back to the session.Please note that this format is still experimental and you shouldn’t use it for long term storage.
+You can save any object of type torch_tensor to the disk using:
+x <- torch_randn(10, 10) +torch_save(x, "tensor.pt") +x_ <- torch_load("tensor.pt") + +torch_allclose(x, x_) +#> [1] TRUE
The torch_save and torch_load functions also work for nn_module objects.
When saving an nn_module, the whole object is serialized, including the model structure and its state.
+module <- nn_module( + "my_module", + initialize = function() { + self$fc1 <- nn_linear(10, 10) + self$fc2 <- nn_linear(10, 1) + }, + forward = function(x) { + x %>% + self$fc1() %>% + self$fc2() + } +) + +model <- module() +torch_save(model, "model.pt") +model_ <- torch_load("model.pt") + +# input tensor +x <- torch_randn(50, 10) +torch_allclose(model(x), model_(x)) +#> [1] TRUE
Currently the only way to load models from Python is to rewrite the model architecture in R. All the parameter names must be identical.
+You can then save the PyTorch model state_dict using:
+torch.save(model.state_dict(), fpath, _use_new_zipfile_serialization=True)
+You can then load the state dict in R and copy it into the model with:
++state_dict <- load_state_dict(fpath) +model <- Model() +model$load_state_dict(state_dict)
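For illustration, here is a minimal sketch of such a rewrite, assuming (hypothetically) that the Python model consisted of a single linear layer registered as fc; the layer sizes and the file name are made up:

library(torch)

# R module whose parameter names ("fc.weight", "fc.bias") mirror the Python model
Model <- nn_module(
  "Model",
  initialize = function() {
    self$fc <- nn_linear(10, 1)
  },
  forward = function(x) {
    self$fc(x)
  }
)

state_dict <- load_state_dict("model.pt")  # file written by the torch.save() call above (path is hypothetical)
model <- Model()
model$load_state_dict(state_dict)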
You can find working examples in torchvision. For example, this is what we do for the AlexNet model.
In this article we describe various ways of creating torch tensors in R.
You can create tensors from R objects using the torch_tensor function. The torch_tensor function takes an R vector, matrix or array and creates an equivalent torch_tensor.
You can see a few examples below:
-torch_tensor(c(1,2,3)) +torch_tensor(c(1,2,3)) #> torch_tensor #> 1 #> 2 @@ -173,37 +199,35 @@ #> [ CPUFloatType{3} ] # conform to row-major indexing used in torch -torch_tensor(matrix(1:10, ncol = 5, nrow = 2, byrow = TRUE)) +torch_tensor(matrix(1:10, ncol = 5, nrow = 2, byrow = TRUE)) #> torch_tensor #> 1 2 3 4 5 #> 6 7 8 9 10 #> [ CPULongType{2,5} ] -torch_tensor(array(runif(12), dim = c(2, 2, 3))) +torch_tensor(array(runif(12), dim = c(2, 2, 3))) #> torch_tensor #> (1,.,.) = -#> 0.5612 0.4325 0.6571 -#> 0.4899 0.4636 0.9910 +#> 0.8530 0.3081 0.4772 +#> 0.3160 0.0449 0.0551 #> #> (2,.,.) = -#> 0.7073 0.8791 0.3117 -#> 0.9112 0.6383 0.1045 -#> [ CPUFloatType{2,2,3} ] -
By default, we will create tensors on the CPU device, converting their R datatype to the corresponding torch dtype.
Note that currently only numeric and boolean types are supported.
You can always modify dtype and device when converting an R object to a torch tensor. For example:
-torch_tensor(1, dtype = torch_long()) +torch_tensor(1, dtype = torch_long()) #> torch_tensor #> 1 #> [ CPULongType{1} ] -torch_tensor(1, device = "cpu", dtype = torch_float64()) +torch_tensor(1, device = "cpu", dtype = torch_float64()) #> torch_tensor #> 1 -#> [ CPUDoubleType{1} ] -
Other options, such as the device and whether the tensor requires gradients, can also be set when creating a tensor.
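For instance (a small sketch; requires_grad applies to floating-point tensors such as the default float dtype):

x <- torch_tensor(c(1, 2, 3), device = "cpu", requires_grad = TRUE)
x$requires_grad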
You can also use the torch_* functions listed below to create torch tensors using some algorithm.
For example, the torch_randn function will create tensors using the normal distribution with mean 0 and standard deviation 1. You can use the ... argument to pass the size of the dimensions. For example, the code below will create a normally distributed tensor with shape 5x3.
-x <- torch_randn(5, 3) -x +x <- torch_randn(5, 3) +x #> torch_tensor -#> -1.5887 -0.0033 1.0389 -#> 0.0472 -1.0173 -1.5143 -#> 1.9183 -0.6090 -0.9197 -#> 1.7162 -1.8687 0.8053 -#> 1.0018 0.6406 -0.5853 -#> [ CPUFloatType{5,3} ] -
Another example is torch_ones, which creates a tensor filled with ones.
-x <- torch_ones(2, 4, dtype = torch_int64(), device = "cpu") -x +x <- torch_ones(2, 4, dtype = torch_int64(), device = "cpu") +x #> torch_tensor #> 1 1 1 1 #> 1 1 1 1 -#> [ CPULongType{2,4} ] -
Here is the full list of functions that can be used to bulk-create tensors in torch:
Once a tensor exists, you can convert between dtypes and move it to a different device with the $to method. For example:
-x <- torch_tensor(1) -y <- x$to(dtype = torch_int32()) -x +x <- torch_tensor(1) +y <- x$to(dtype = torch_int32()) +x #> torch_tensor #> 1 #> [ CPUFloatType{1} ] -y +y #> torch_tensor #> 1 -#> [ CPUIntType{1} ] -
You can also copy a tensor to the GPU using:
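For example, a guarded sketch that only runs when a CUDA-enabled build of torch is available:

x <- torch_tensor(c(1, 2, 3))
if (cuda_is_available()) {
  x_gpu <- x$to(device = "cuda")
  x_gpu$device
}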
diff --git a/articles/tensor/index.html b/articles/tensor/index.html index ac0649d5127f07c228ec3f4dc2199f9fb497542b..998d0c4ff07b65e106fc960fcb359050ee5a102a 100644 --- a/articles/tensor/index.html +++ b/articles/tensor/index.html @@ -11,12 +11,19 @@ - + + +Central to torch are torch_tensor objects. torch_tensor objects are very similar to R6 instances. Tensors have a large number of methods that can be called using the $ operator.
Following is a list of all methods that can be called by tensor objects and their documentation. You can also look at PyTorch’s documentation for additional details.
# Example 1: Applying a mask -mask <- torch_randint(low = 0, high = 2, size = c(127, 128), dtype=torch_bool())$refine_names(c('W', 'H')) -imgs <- torch_randn(32, 128, 127, 3, names=c('N', 'H', 'W', 'C')) -imgs$masked_fill_(mask$align_as(imgs), 0) +mask <- torch_randint(low = 0, high = 2, size = c(127, 128), dtype=torch_bool())$refine_names(c('W', 'H')) +imgs <- torch_randn(32, 128, 127, 3, names=c('N', 'H', 'W', 'C')) +imgs$masked_fill_(mask$align_as(imgs), 0) # Example 2: Applying a per-channel-scale -scale_channels <- function(input, scale) { - scale <- scale$refine_names("C") - input * scale$align_as(input) -} +scale_channels <- function(input, scale) { + scale <- scale$refine_names("C") + input * scale$align_as(input) +} -num_channels <- 3 -scale <- torch_randn(num_channels, names='C') -imgs <- torch_rand(32, 128, 128, num_channels, names=c('N', 'H', 'W', 'C')) -more_imgs = torch_rand(32, num_channels, 128, 128, names=c('N', 'C', 'H', 'W')) -videos = torch_randn(3, num_channels, 128, 128, 128, names=c('N', 'C', 'H', 'W', 'D')) +num_channels <- 3 +scale <- torch_randn(num_channels, names='C') +imgs <- torch_rand(32, 128, 128, num_channels, names=c('N', 'H', 'W', 'C')) +more_imgs = torch_rand(32, num_channels, 128, 128, names=c('N', 'C', 'H', 'W')) +videos = torch_randn(3, num_channels, 128, 128, 128, names=c('N', 'C', 'H', 'W', 'D')) # scale_channels is agnostic to the dimension order of the input -scale_channels(imgs, scale) -scale_channels(more_imgs, scale) -scale_channels(videos, scale) -
-a <- torch_rand(1, 2)$to(dtype = torch_bool()) -a -a$all() -
all(dim, keepdim=FALSE, out=NULL) -> Tensor
Returns TRUE if all elements in each row of the tensor in the given dimension dim are TRUE, FALSE otherwise.
If keepdim is TRUE, the output tensor is of the same size as input except in the dimension dim where it is of size 1. Otherwise, dim is squeezed (see ?torch_squeeze()), resulting in the output tensor having 1 fewer dimension than input.
-a <- torch_rand(4, 2)$to(dtype = torch_bool()) -a -a$all(dim=2) -a$all(dim=1) -
-a <- torch_rand(1, 2)$to(dtype = torch_bool()) -a -a$any() -
any(dim, keepdim=FALSE, out=NULL) -> Tensor
Returns TRUE if any elements in each row of the tensor in the given dimension dim are TRUE, FALSE otherwise.
If keepdim is TRUE, the output tensor is of the same size as input except in the dimension dim where it is of size 1. Otherwise, dim is squeezed (see ?torch_squeeze()), resulting in the output tensor having 1 fewer dimension than input.
-a <- torch_randn(4, 2) < 0 -a -a$any(2) -a$any(1) -
-torch_tensor(c(1))$element_size() -
-x <- torch_tensor(matrix(c(1,2,3), ncol = 1)) -x$size() -x$expand(c(3, 4)) -x$expand(c(-1, 4)) # -1 means not changing the size of that dimension -
-a <- torch_zeros(3, 3) -a$fill_diagonal_(5) -b <- torch_zeros(7, 3) -b$fill_diagonal_(5) -c <- torch_zeros(7, 3) -c$fill_diagonal_(5, wrap=TRUE) -
-x <- torch_randn(3, 4, 5, device='cuda:0') -x$get_device() -x$cpu()$get_device() # RuntimeError: get_device is not implemented for type torch_FloatTensor -
-x <- torch_randn(4, dtype=torch_cfloat()) -x -x$imag -
-x <- torch_ones(5, 3) -t <- torch_tensor(matrix(1:9, ncol = 3), dtype=torch_float()) -index <- torch_tensor(c(1L, 4L, 3L)) -x$index_add_(1, index, t) -
-x <- torch_zeros(5, 3) -t <- torch_tensor(matrix(1:9, ncol = 3), dtype=torch_float()) -index <- torch_tensor(c(1, 5, 3)) -x$index_copy_(1, index, t) -
-x <- torch_tensor(matrix(1:9, ncol = 3), dtype=torch_float()) -index <- torch_tensor(c(1, 3), dtype = torch_long()) -x$index_fill_(1, index, -1) -
-a <- torch_rand(10, requires_grad=TRUE) -a$is_leaf() +a <- torch_rand(10, requires_grad=TRUE) +a$is_leaf() # b <- torch_rand(10, requires_grad=TRUE)$cuda() # b$is_leaf() # FALSE # b was created by the operation that cast a cpu Tensor into a cuda Tensor -c <- torch_rand(10, requires_grad=TRUE) + 2 -c$is_leaf() +c <- torch_rand(10, requires_grad=TRUE) + 2 +c$is_leaf() # c was created by the addition operation # d <- torch_rand(10)$cuda() @@ -1689,8 +1702,7 @@ f(X=k) = p^{k - 1} (1 - p) # f <- torch_rand(10, requires_grad=TRUE, device="cuda") # f$is_leaf # TRUE -# f requires grad, has no operation creating it -
-x <- torch_tensor(1.0) -x$item() -
-x <- torch_tensor(matrix(1:9, ncol = 3)) -x$narrow(1, 1, 3) -x$narrow(1, 1, 2) -
-tensor <- torch_ones(5) -tensor$new_empty(c(2, 3)) -
-tensor <- torch_ones(c(2), dtype=torch_float64()) -tensor$new_full(c(3, 4), 3.141592) -
-tensor <- torch_tensor(c(2), dtype=torch_int32()) -tensor$new_ones(c(2, 3)) -
-tensor <- torch_ones(c(2), dtype=torch_int8()) -data <- matrix(1:4, ncol = 2) -tensor$new_tensor(data) -
-tensor <- torch_tensor(c(1), dtype=torch_float64()) -tensor$new_zeros(c(2, 3)) -
-x <- torch_randn(2, 3, 5) -x$size() -x$permute(c(3, 1, 2))$size() -
-src <- torch_tensor(matrix(3:8, ncol = 3)) -src$put_(torch_tensor(1:2), torch_tensor(9:10)) -
-x <- torch_randn(4, dtype=torch_cfloat()) -x -x$real -
-imgs <- torch_randn(32, 3, 128, 128) -named_imgs <- imgs$refine_names(c('N', 'C', 'H', 'W')) -named_imgs$names -
-v <- torch_tensor(c(0., 0., 0.), requires_grad=TRUE) -h <- v$register_hook(function(grad) grad * 2) # double the gradient -v$backward(torch_tensor(c(1., 2., 3.))) -v$grad -h$remove() -
-imgs <- torch_rand(2, 3, 5, 7, names=c('N', 'C', 'H', 'W')) -renamed_imgs <- imgs$rename(c("Batch", "Channels", "Height", "Width")) -
-x <- torch_tensor(c(1, 2, 3)) -x$`repeat`(c(4, 2)) -x$`repeat`(c(4, 2, 1))$size() -
# Let's say we want to preprocess some saved weights and use # the result as new weights. -saved_weights <- c(0.1, 0.2, 0.3, 0.25) -loaded_weights <- torch_tensor(saved_weights) -weights <- preprocess(loaded_weights) # some function -weights +saved_weights <- c(0.1, 0.2, 0.3, 0.25) +loaded_weights <- torch_tensor(saved_weights) +weights <- preprocess(loaded_weights) # some function +weights # Now, start to record operations done to weights -weights$requires_grad_() -out <- weights$pow(2)$sum() -out$backward() -weights$grad -
-x <- torch_tensor(matrix(1:6, ncol = 2)) -x$resize_(c(2, 2)) -
-x <- torch_rand(2, 5) -x -torch_zeros(3, 5)$scatter_( +x <- torch_rand(2, 5) +x +torch_zeros(3, 5)$scatter_( 1, - torch_tensor(rbind(c(2, 3, 3, 1, 1), c(3, 1, 1, 2, 3))), x -) + torch_tensor(rbind(c(2, 3, 3, 1, 1), c(3, 1, 1, 2, 3))), x +) -z <- torch_zeros(2, 4)$scatter_( +z <- torch_zeros(2, 4)$scatter_( 2, - torch_tensor(matrix(3:4, ncol = 1)), 1.23 -) -
-x <- torch_rand(2, 5) -x -torch_ones(3, 5)$scatter_add_(1, torch_tensor(rbind(c(1, 2, 3, 1, 1), c(3, 1, 1, 2, 3))), x) -
-torch_empty(3, 4, 5)$size() -
-x <- torch_tensor(c(1, 2, 3, 4, 5)) -x$storage_offset() -x[3:N]$storage_offset() -
-x <- torch_tensor(matrix(1:10, nrow = 2)) -x$stride() -x$stride(1) -x$stride(-1) -
-tensor <- torch_randn(2, 2) # Initially dtype=float32, device=cpu -tensor$to(dtype = torch_float64()) +tensor <- torch_randn(2, 2) # Initially dtype=float32, device=cpu +tensor$to(dtype = torch_float64()) -other <- torch_randn(1, dtype=torch_float64()) -tensor$to(other = other, non_blocking=TRUE) -
So far, all we’ve been using from torch is tensors, but we’ve been performing all calculations ourselves – computing the predictions, the loss, the gradients (and thus, the necessary updates to the weights), and the new weight values. In this chapter, we’ll make a significant change: namely, we spare ourselves the cumbersome calculation of gradients, and have torch do it for us.
Before we see that in action, let’s get some more background.
Torch uses a module called autograd to record operations performed on tensors, and store what has to be done to obtain the respective gradients. These actions are stored as functions, and those functions are applied in order when the gradient of the output (normally, the loss) with respect to those tensors is calculated: starting from the output node and propagating gradients back through the network. This is a form of reverse mode automatic differentiation.
As users, we can see a bit of this implementation. As a prerequisite for this “recording” to happen, tensors have to be created with requires_grad = TRUE. E.g.
-x <- torch_ones(2,2, requires_grad = TRUE) -
To be clear, this is a tensor with respect to which gradients have to be calculated – normally, a tensor representing a weight or a bias, not the input data. If we now perform some operation on that tensor, assigning the result to y
-y <- x$mean() -
we find that y now has a non-empty grad_fn that tells torch how to compute the gradient of y with respect to x:
-y$grad_fn -#> MeanBackward0 -
Actual computation of gradients is triggered by calling backward() on the output tensor.
-y$backward() -
That executed, x now has a non-empty field grad that stores the gradient of y with respect to x:
-x$grad +x$grad #> torch_tensor #> 0.2500 0.2500 #> 0.2500 0.2500 -#> [ CPUFloatType{2,2} ] -
With a longer chain of computations, we can peek at how torch builds up a graph of backward operations.
Here is a slightly more complex example. We call retain_grad() on y and z just for demonstration purposes; by default, intermediate gradients – while of course they have to be computed – aren’t stored, in order to save memory.
-x1 <- torch_ones(2,2, requires_grad = TRUE) -x2 <- torch_tensor(1.1, requires_grad = TRUE) -y <- x1 * (x2 + 2) -y$retain_grad() -z <- y$pow(2) * 3 -z$retain_grad() -out <- z$mean() -
Starting from out$grad_fn, we can follow the graph all back to the leaf nodes:
# how to compute the gradient for mean, the last operation executed -out$grad_fn +out$grad_fn #> MeanBackward0 # how to compute the gradient for the multiplication by 3 in z = y$pow(2) * 3 -out$grad_fn$next_functions +out$grad_fn$next_functions #> [[1]] #> MulBackward1 # how to compute the gradient for pow in z = y.pow(2) * 3 -out$grad_fn$next_functions[[1]]$next_functions +out$grad_fn$next_functions[[1]]$next_functions #> [[1]] #> PowBackward0 # how to compute the gradient for the multiplication in y = x * (x + 2) -out$grad_fn$next_functions[[1]]$next_functions[[1]]$next_functions +out$grad_fn$next_functions[[1]]$next_functions[[1]]$next_functions #> [[1]] #> MulBackward0 # how to compute the gradient for the two branches of y = x * (x + 2), # where the left branch is a leaf node (AccumulateGrad for x1) -out$grad_fn$next_functions[[1]]$next_functions[[1]]$next_functions[[1]]$next_functions +out$grad_fn$next_functions[[1]]$next_functions[[1]]$next_functions[[1]]$next_functions #> [[1]] #> torch::autograd::AccumulateGrad #> [[2]] #> AddBackward1 # here we arrive at the other leaf node (AccumulateGrad for x2) -out$grad_fn$next_functions[[1]]$next_functions[[1]]$next_functions[[1]]$next_functions[[2]]$next_functions +out$grad_fn$next_functions[[1]]$next_functions[[1]]$next_functions[[1]]$next_functions[[2]]$next_functions #> [[1]] -#> torch::autograd::AccumulateGrad -
After calling out$backward(), all tensors in the graph will have their respective gradients created. Without our calls to retain_grad above, z$grad and y$grad would be empty:
-out$backward() -z$grad +out$backward() +z$grad #> torch_tensor #> 0.2500 0.2500 #> 0.2500 0.2500 #> [ CPUFloatType{2,2} ] -y$grad +y$grad #> torch_tensor #> 4.6500 4.6500 #> 4.6500 4.6500 #> [ CPUFloatType{2,2} ] -x2$grad +x2$grad #> torch_tensor #> 18.6000 #> [ CPUFloatType{1} ] -x1$grad +x1$grad #> torch_tensor #> 14.4150 14.4150 #> 14.4150 14.4150 -#> [ CPUFloatType{2,2} ] -
Thus acquainted with autograd, we’re ready to modify our example.
### generate training data ----------------------------------------------------- # input dimensionality (number of input features) -d_in <- 3 +d_in <- 3 # output dimensionality (number of predicted features) -d_out <- 1 +d_out <- 1 # number of observations in training set -n <- 100 +n <- 100 # create random data -x <- torch_randn(n, d_in) -y <- x[,1]*0.2 - x[..,2]*1.3 - x[..,3]*0.5 + torch_randn(n) -y <- y$unsqueeze(dim = 1) +x <- torch_randn(n, d_in) +y <- x[,1]*0.2 - x[..,2]*1.3 - x[..,3]*0.5 + torch_randn(n) +y <- y$unsqueeze(dim = 1) ### initialize weights --------------------------------------------------------- # dimensionality of hidden layer -d_hidden <- 32 +d_hidden <- 32 # weights connecting input to hidden layer -w1 <- torch_randn(d_in, d_hidden, requires_grad = TRUE) +w1 <- torch_randn(d_in, d_hidden, requires_grad = TRUE) # weights connecting hidden to output layer -w2 <- torch_randn(d_hidden, d_out, requires_grad = TRUE) +w2 <- torch_randn(d_hidden, d_out, requires_grad = TRUE) # hidden layer bias -b1 <- torch_zeros(1, d_hidden, requires_grad = TRUE) +b1 <- torch_zeros(1, d_hidden, requires_grad = TRUE) # output layer bias -b2 <- torch_zeros(1, d_out,requires_grad = TRUE) +b2 <- torch_zeros(1, d_out,requires_grad = TRUE) ### network parameters --------------------------------------------------------- -learning_rate <- 1e-4 +learning_rate <- 1e-4 ### training loop -------------------------------------------------------------- -for (t in 1:200) { +for (t in 1:200) { ### -------- Forward pass -------- - y_pred <- x$mm(w1)$add(b1)$clamp(min = 0)$mm(w2)$add(b2) + y_pred <- x$mm(w1)$add(b1)$clamp(min = 0)$mm(w2)$add(b2) ### -------- compute loss -------- - loss <- (y_pred - y)$pow(2)$mean() - if (t %% 10 == 0) cat(t, as_array(loss), "\n") + loss <- (y_pred - y)$pow(2)$mean() + if (t %% 10 == 0) cat(t, as_array(loss), "\n") ### -------- Backpropagation -------- # compute the gradient of loss with respect to all tensors with requires_grad = True. - loss$backward() + loss$backward() ### -------- Update weights -------- # Wrap in torch.no_grad() because this is a part we DON'T want to record for automatic gradient computation - with_no_grad({ + with_no_grad({ - w1$sub_(learning_rate * w1$grad) - w2$sub_(learning_rate * w2$grad) - b1$sub_(learning_rate * b1$grad) - b2$sub_(learning_rate * b2$grad) + w1$sub_(learning_rate * w1$grad) + w2$sub_(learning_rate * w2$grad) + b1$sub_(learning_rate * b1$grad) + b2$sub_(learning_rate * b2$grad) # Zero the gradients after every pass, because they'd accumulate otherwise - w1$grad$zero_() - w2$grad$zero_() - b1$grad$zero_() - b2$grad$zero_() + w1$grad$zero_() + w2$grad$zero_() + b1$grad$zero_() + b2$grad$zero_() - }) + }) -} -#> 10 27.60956 -#> 20 25.39985 -#> 30 23.42485 -#> 40 21.65899 -#> 50 20.07844 -#> 60 18.66107 -#> 70 17.38713 -#> 80 16.24375 -#> 90 15.21763 -#> 100 14.29351 -#> 110 13.45975 -#> 120 12.70921 -#> 130 12.03104 -#> 140 11.41835 -#> 150 10.86677 -#> 160 10.36613 -#> 170 9.911062 -#> 180 9.496947 -#> 190 9.121381 -#> 200 8.778724 -
We still manually compute the forward pass, and we still manually update the weights. In the last two chapters of this section, we’ll see how these parts of the logic can be made more modular and reusable, as well.
NEWS.md
- new(): (Dev related) Initializes the context. Not user related.
AutogradContext$new( - ptr, - env, - argument_names = NULL, - argument_needs_grad = NULL -)
AutogradContext$new( + ptr, + env, + argument_names = NULL, + argument_needs_grad = NULL +)
Arguments can also be any kind of R object.
AutogradContext$save_for_backward(...)
AutogradContext$save_for_backward(...)
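As a rough sketch of how save_for_backward() is typically used from a custom autograd_function() (this assumes the usual forward/backward pair and the ctx$saved_variables accessor; the exp-based function is purely illustrative):

exp2 <- autograd_function(
  forward = function(ctx, input) {
    result <- input$exp()
    # stash the result so that backward() can reuse it
    ctx$save_for_backward(result = result)
    result
  },
  backward = function(ctx, grad_output) {
    # return a named list of gradients, one entry per forward() argument
    list(input = grad_output * ctx$saved_variables$result)
  }
)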
backward(), but it’s always going to be a zero tensor with the same
shape as the corresponding output.
This is used e.g. for indices returned from a max Function.
AutogradContext$mark_non_differentiable(...)
AutogradContext$mark_non_differentiable(...)
Every tensor that’s been modified in-place in a call to forward() should
be given to this function, to ensure correctness of our checks. It doesn’t
matter whether the function is called before or after modification.
AutogradContext$mark_dirty(...)
AutogradContext$mark_dirty(...)
clone()The objects of this class are cloneable with this method.
AutogradContext$clone(deep = FALSE)
AutogradContext$clone(deep = FALSE)
Converts to array
as_array(x)+
as_array(x)
Binary cross entropy loss |
| + + | +BCE with logits loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Bilinear module |
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Creates a nn_buffer |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| @@ -1945,12 +1989,24 @@ planes. | ConvTranspose3D module |
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Cosine embedding loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CrossEntropyLoss module |
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +The Connectionist Temporal Classification loss. |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| @@ -2029,6 +2085,12 @@ planes. | Hardtanh module |
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Hinge embedding loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| @@ -2125,6 +2187,18 @@ planes. | Zeros initialization |
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Kullback-Leibler divergence loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +L1 loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| @@ -2163,6 +2237,12 @@ planes. planes. | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Margin ranking loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| @@ -2212,12 +2292,60 @@ planes. | Holds submodules in a list. |
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +MSE loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Multi margin loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
MultiHead attention |
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Multilabel margin loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Multi label soft margin loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Nll loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Pairwise distance |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Creates an |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Poisson NLL loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| @@ -2266,6 +2394,18 @@ planes. | Sigmoid module |
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Smooth L1 loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Soft margin loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| @@ -2320,6 +2460,18 @@ planes. | Threshold module |
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Triplet margin loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Triplet margin with distance loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| @@ -2342,6 +2494,24 @@ planes. | Pad a list of variable length Tensors with |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Checks if the object is an nn_module |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Checks if an object is a nn_parameter |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Checks if the object is a nn_buffer |
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Triplet_margin_loss |
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Triplet margin with distance loss |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| @@ -2956,6 +3132,57 @@ planes. | SGD optimizer |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Checks if the object is a torch optimizer |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
+ Learning rate schedulers+ + |
+ ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Sets the learning rate of each parameter group to the initial lr +times a given function. When last_epoch=-1, sets initial lr as lr. |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Multiply the learning rate of each parameter group by the factor given +in the specified function. When last_epoch=-1, sets initial lr as lr. |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +One-cycle learning rate |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Creates learning rate schedulers |
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| + + | +Step learning rate decay |
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| optimizer | +(Optimizer): Wrapped optimizer. |
+
|---|---|
| lr_lambda | +(function or list): A function which computes a multiplicative +factor given an integer parameter epoch, or a list of such +functions, one for each group in optimizer.param_groups. |
+
| last_epoch | +(int): The index of last epoch. Default: -1. |
+
| verbose | +(bool): If |
+
+if (torch_is_installed()) { +# Assuming optimizer has two groups. +lambda1 <- function(epoch) epoch %/% 30 +lambda2 <- function(epoch) 0.95^epoch +if (FALSE) { +scheduler <- lr_lambda(optimizer, lr_lambda = list(lambda1, lambda2)) +for (epoch in 1:100) { + train(...) + validate(...) + scheduler$step() +} +} + +} +
R/optim-lr_scheduler.R
+ lr_multiplicative.RdMultiply the learning rate of each parameter group by the factor given +in the specified function. When last_epoch=-1, sets initial lr as lr.
+lr_multiplicative(optimizer, lr_lambda, last_epoch = -1, verbose = FALSE)+ +
| optimizer | +(Optimizer): Wrapped optimizer. |
+
|---|---|
| lr_lambda | +(function or list): A function which computes a multiplicative +factor given an integer parameter epoch, or a list of such +functions, one for each group in optimizer.param_groups. |
+
| last_epoch | +(int): The index of last epoch. Default: -1. |
+
| verbose | +(bool): If |
+
+if (torch_is_installed()) { +if (FALSE) { +lmbda <- function(epoch) 0.95 +scheduler <- lr_multiplicative(optimizer, lr_lambda=lmbda) +for (epoch in 1:100) { + train(...) + validate(...) + scheduler$step() +} +} + +} +
Sets the learning rate of each parameter group according to the +1cycle learning rate policy. The 1cycle policy anneals the learning +rate from an initial learning rate to some maximum learning rate and then +from that maximum learning rate to some minimum learning rate much lower +than the initial learning rate.
+lr_one_cycle( + optimizer, + max_lr, + total_steps = NULL, + epochs = NULL, + steps_per_epoch = NULL, + pct_start = 0.3, + anneal_strategy = "cos", + cycle_momentum = TRUE, + base_momentum = 0.85, + max_momentum = 0.95, + div_factor = 25, + final_div_factor = 10000, + last_epoch = -1, + verbose = FALSE +)+ +
| optimizer | +(Optimizer): Wrapped optimizer. |
+
|---|---|
| max_lr | +(float or list): Upper learning rate boundaries in the cycle +for each parameter group. |
+
| total_steps | +(int): The total number of steps in the cycle. Note that +if a value is not provided here, then it must be inferred by providing +a value for epochs and steps_per_epoch. +Default: NULL |
+
| epochs | +(int): The number of epochs to train for. This is used along +with steps_per_epoch in order to infer the total number of steps in the cycle +if a value for total_steps is not provided. +Default: NULL |
+
| steps_per_epoch | +(int): The number of steps per epoch to train for. This is +used along with epochs in order to infer the total number of steps in the +cycle if a value for total_steps is not provided. +Default: NULL |
+
| pct_start | +(float): The percentage of the cycle (in number of steps) spent +increasing the learning rate. +Default: 0.3 |
+
| anneal_strategy | +(str): 'cos', 'linear' +Specifies the annealing strategy: "cos" for cosine annealing, "linear" for +linear annealing. +Default: 'cos' |
+
| cycle_momentum | +(bool): If |
+
| base_momentum | +(float or list): Lower momentum boundaries in the cycle +for each parameter group. Note that momentum is cycled inversely +to learning rate; at the peak of a cycle, momentum is +'base_momentum' and learning rate is 'max_lr'. +Default: 0.85 |
+
| max_momentum | +(float or list): Upper momentum boundaries in the cycle +for each parameter group. Functionally, +it defines the cycle amplitude (max_momentum - base_momentum). +Note that momentum is cycled inversely +to learning rate; at the start of a cycle, momentum is 'max_momentum' +and learning rate is 'base_lr' +Default: 0.95 |
+
| div_factor | +(float): Determines the initial learning rate via +initial_lr = max_lr/div_factor +Default: 25 |
+
| final_div_factor | +(float): Determines the minimum learning rate via +min_lr = initial_lr/final_div_factor +Default: 1e4 |
+
| last_epoch | +(int): The index of the last batch. This parameter is used when
+resuming a training job. Since |
+
| verbose | +(bool): If |
+
This policy was initially described in the paper +Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates.
+The 1cycle learning rate policy changes the learning rate after every batch.
+step should be called after a batch has been used for training.
+This scheduler is not chainable.
Note also that the total number of steps in the cycle can be determined in one +of two ways (listed in order of precedence):
A value for total_steps is explicitly provided.
A number of epochs (epochs) and a number of steps per epoch +(steps_per_epoch) are provided.
In this case, the number of total steps is inferred by +total_steps = epochs * steps_per_epoch
+You must either provide a value for total_steps or provide a value for both +epochs and steps_per_epoch.
+ ++if (torch_is_installed()) { +if (FALSE) { +data_loader <- dataloader(...) +optimizer <- optim_sgd(model$parameters, lr=0.1, momentum=0.9) +scheduler <- lr_one_cycle(optimizer, max_lr=0.01, steps_per_epoch=length(data_loader), + epochs=10) + +for (i in 1:epochs) { + for (batch in enumerate(data_loader)) { + train_batch(...) + scheduler$step() + } +} +} + +} +
Creates learning rate schedulers
+lr_scheduler( + classname = NULL, + inherit = LRScheduler, + ..., + parent_env = parent.frame() +)+ +
| classname | +optional name for the learning rate scheduler |
+
|---|---|
| inherit | +an optional learning rate scheduler to inherit from |
+
| ... | +named list of methods. You must implement the |
+
| parent_env | +passed to |
+
Decays the learning rate of each parameter group by gamma every +step_size epochs. Notice that such decay can happen simultaneously with +other changes to the learning rate from outside this scheduler. When +last_epoch=-1, sets initial lr as lr.
+lr_step(optimizer, step_size, gamma = 0.1, last_epoch = -1)+ +
| optimizer | +(Optimizer): Wrapped optimizer. |
+
|---|---|
| step_size | +(int): Period of learning rate decay. |
+
| gamma | +(float): Multiplicative factor of learning rate decay. +Default: 0.1. |
+
| last_epoch | +(int): The index of last epoch. Default: -1. |
+
+if (torch_is_installed()) { +if (FALSE) { +# Assuming optimizer uses lr = 0.05 for all groups +# lr = 0.05 if epoch < 30 +# lr = 0.005 if 30 <= epoch < 60 +# lr = 0.0005 if 60 <= epoch < 90 +# ... +scheduler <- lr_step(optimizer, step_size=30, gamma=0.1) +for (epoch in 1:100) { + train(...) + validate(...) + scheduler$step() +} +} + +} +
nn_adaptive_avg_pool1d(output_size)+
nn_adaptive_avg_pool1d(output_size)
| reduction | +(string, optional): Specifies the reduction to apply to the output:
+ |
+
|---|
As with nn_nll_loss(), the input given is expected to contain
+log-probabilities and is not restricted to a 2D Tensor.
The targets are interpreted as probabilities by default, but could be considered
+as log-probabilities with log_target set to TRUE.
This criterion expects a target Tensor of the same size as the
+input Tensor.
The unreduced (i.e. with reduction set to 'none') loss can be described
+as:
$$ + l(x,y) = L = \{ l_1,\dots,l_N \}, \quad +l_n = y_n \cdot \left( \log y_n - x_n \right) +$$
+where the index \(N\) spans all dimensions of input and \(L\) has the same
+shape as input. If reduction is not 'none' (default 'mean'), then:
$$ + \ell(x, y) = \begin{array}{ll} +\mbox{mean}(L), & \mbox{if reduction} = \mbox{'mean';} \\ +\mbox{sum}(L), & \mbox{if reduction} = \mbox{'sum'.} +\end{array} +$$
+In default reduction mode 'mean', the losses are averaged for each minibatch
+over observations as well as over dimensions. 'batchmean' mode gives the
+correct KL divergence where losses are averaged over batch dimension only.
+'mean' mode's behavior will be changed to the same as 'batchmean' in the next
+major release.
reduction = 'mean' doesn't return the true kl divergence value,
+please use reduction = 'batchmean' which aligns with KL math
+definition.
+In the next major release, 'mean' will be changed to be the same as
+'batchmean'.
Input: \((N, *)\) where \(*\) means, any number of additional +dimensions
Target: \((N, *)\), same shape as the input
Output: scalar by default. If reduction is 'none', then \((N, *)\),
+the same shape as the input
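A brief usage sketch (assuming the constructor behind this page is nn_kl_div_loss(), as listed in the reference index; input holds log-probabilities, target holds probabilities):

loss <- nn_kl_div_loss(reduction = "batchmean")
input <- nnf_log_softmax(torch_randn(3, 5), dim = 2)
target <- nnf_softmax(torch_randn(3, 5), dim = 2)
output <- loss(input, target)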
Creates a criterion that measures the mean absolute error (MAE) between each +element in the input \(x\) and target \(y\).
+nn_l1_loss(reduction = "mean")+ +
| reduction | +(string, optional): Specifies the reduction to apply to the output:
+ |
+
|---|
The unreduced (i.e. with reduction set to 'none') loss can be described
+as:
$$ +\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad +l_n = \left| x_n - y_n \right|, +$$
+where \(N\) is the batch size. If reduction is not 'none'
+(default 'mean'), then:
$$ +\ell(x, y) = +\begin{array}{ll} +\mbox{mean}(L), & \mbox{if reduction} = \mbox{'mean';}\\ +\mbox{sum}(L), & \mbox{if reduction} = \mbox{'sum'.} +\end{array} +$$
+\(x\) and \(y\) are tensors of arbitrary shapes with a total +of \(n\) elements each.
+The sum operation still operates over all the elements, and divides by \(n\).
+The division by \(n\) can be avoided if one sets reduction = 'sum'.
Input: \((N, *)\) where \(*\) means, any number of additional +dimensions
Target: \((N, *)\), same shape as the input
Output: scalar. If reduction is 'none', then
+\((N, *)\), same shape as the input
+if (torch_is_installed()) { +loss <- nn_l1_loss() +input <- torch_randn(3, 5, requires_grad=TRUE) +target <- torch_randn(3, 5) +output <- loss(input, target) +output$backward() + +} +
Applies the element-wise function:
$$
\mbox{LeakyReLU}(x) = \max(0, x) + \mbox{negative\_slope} \cdot \min(0, x)
$$
nn_leaky_relu(negative_slope = 0.01, inplace = FALSE)+
nn_leaky_relu(negative_slope = 0.01, inplace = FALSE)
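A short usage sketch:

m <- nn_leaky_relu(0.1)
input <- torch_randn(2)
output <- m(input)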
| parent_env | -passed to |
+ passed to |
|---|
if (torch_is_installed()) { -model <- nn_module( - initialize = function() { - self$conv1 <- nn_conv2d(1, 20, 5) - self$conv2 <- nn_conv2d(20, 20, 5) - }, - forward = function(input) { - input <- self$conv1(input) - input <- nnf_relu(input) - input <- self$conv2(input) - input <- nnf_relu(input) - input - } -) - -} +@@ -147,6 +159,9 @@ nn_module methods." />if (torch_is_installed()) { +model <- nn_module( + initialize = function() { + self$conv1 <- nn_conv2d(1, 20, 5) + self$conv2 <- nn_conv2d(20, 20, 5) + }, + forward = function(input) { + input <- self$conv1(input) + input <- nnf_relu(input) + input <- self$conv2(input) + input <- nnf_relu(input) + input + } +) + +}
nn_module methods.
- nn_module_list(modules = list())+
nn_module_list(modules = list())
| anchor | +the anchor input tensor |
+
|---|---|
| positive | +the positive input tensor |
+
| negative | +the negative input tensor |
+
| distance_function | +(callable, optional): A nonnegative, real-valued function that
+quantifies the closeness of two tensors. If not specified,
+ |
+
| margin | +Default: 1. |
+
| swap | +The distance swap is described in detail in the paper Learning shallow
+convolutional feature descriptors with triplet losses by V. Balntas, E. Riba et al.
+Default: |
+
| reduction | +(string, optional) – Specifies the reduction to apply to the +output: 'none' | 'mean' | 'sum'. 'none': no reduction will be applied, 'mean': +the sum of the output will be divided by the number of elements in the output, +'sum': the output will be summed. Default: 'mean' |
+
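A small usage sketch, assuming the functional form nnf_triplet_margin_with_distance_loss() with the arguments documented above and nnf_pairwise_distance() supplied as the distance function:

anchor <- torch_randn(100, 128, requires_grad = TRUE)
positive <- torch_randn(100, 128, requires_grad = TRUE)
negative <- torch_randn(100, 128, requires_grad = TRUE)
output <- nnf_triplet_margin_with_distance_loss(
  anchor, positive, negative,
  distance_function = nnf_pairwise_distance
)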
Extracts sliding local blocks from a batched input tensor.
-nnf_unfold(input, kernel_size, dilation = 1, padding = 0, stride = 1)+
nnf_unfold(input, kernel_size, dilation = 1, padding = 0, stride = 1)
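A quick sketch: extracting flattened 2x2 blocks from a one-image, one-channel batch (the output has one column per block, i.e. shape (1, 4, 9) here):

x <- torch_randn(1, 1, 4, 4)
nnf_unfold(x, kernel_size = c(2, 2))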