diff --git a/dwave/plugins/torch/models/boltzmann_machine.py b/dwave/plugins/torch/models/boltzmann_machine.py index ff8739f..00acb28 100644 --- a/dwave/plugins/torch/models/boltzmann_machine.py +++ b/dwave/plugins/torch/models/boltzmann_machine.py @@ -49,6 +49,13 @@ class GraphRestrictedBoltzmannMachine(torch.nn.Module): """Creates a graph-restricted Boltzmann machine. + The initialization strategy is grounded in + `Hinton's practical guide for RBM training`_, which recommends sampling + weights from a Gaussian distribution with mean 0 and standard deviation 0.01 (for zero-one-valued RBMs). + The scaling factor of :math:`1/\\sqrt{N}` ensures that the energy functional remains extensive + and initializes the GRBM in a paramagnetic regime, consistent with the `Sherrington-Kirkpatrick model`_. + The biases are initialized to zero to ensure extensiveness of the energy functional and to avoid introducing any initial preference for spin configurations. + Args: nodes (Iterable[Hashable]): List of nodes. edges (Iterable[tuple[Hashable, Hashable]]): List of edges. 
@@ -82,8 +89,8 @@ def __init__( self._idx_to_edge = {i: e for i, e in enumerate(self._edges)} self._edge_to_idx = {e: i for i, e in self._idx_to_edge.items()} - self._linear = torch.nn.Parameter(0.05 * (2 * torch.rand(self._n_nodes) - 1)) - self._quadratic = torch.nn.Parameter(5.0 * (2 * torch.rand(self._n_edges) - 1)) + self._linear = torch.nn.Parameter(torch.zeros(self._n_nodes)) + self._quadratic = torch.nn.Parameter(torch.randn(self._n_edges)/self._n_nodes**0.5) edge_idx_i = torch.tensor([self._node_to_idx[i] for i, _ in self._edges]) edge_idx_j = torch.tensor([self._node_to_idx[j] for _, j in self._edges]) diff --git a/releasenotes/notes/gaussian-rbm-init-28fd4d295ef86d77.yaml b/releasenotes/notes/gaussian-rbm-init-28fd4d295ef86d77.yaml new file mode 100644 index 0000000..ea450d5 --- /dev/null +++ b/releasenotes/notes/gaussian-rbm-init-28fd4d295ef86d77.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Initialize ``GraphRestrictedBoltzmannMachine`` weights using Gaussian + random variables with standard deviation equal to :math:`1/\sqrt{N}`, where :math:`N` + denotes the number of nodes in the GRBM. The weight-initialization strategy is grounded in `Hinton's practical guide for RBM training <https://www.cs.toronto.edu/~hinton/absps/guideTR.pdf>`_, which recommends sampling weights from a Gaussian distribution with mean 0 and standard deviation 0.01 (for zero-one-valued RBMs). The scaling factor of :math:`1/\sqrt{N}` ensures that the energy functional remains extensive and initializes the GRBM in a paramagnetic regime, consistent with the `Sherrington-Kirkpatrick model <https://en.wikipedia.org/wiki/Spin_glass>`_. 
+ + diff --git a/tests/test_dvae_winci2020.py b/tests/test_dvae_winci2020.py index 38dfff7..e22cd39 100644 --- a/tests/test_dvae_winci2020.py +++ b/tests/test_dvae_winci2020.py @@ -78,12 +78,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # are the models themselves latent_dims_list = [1, 2] self.encoders = {i: Encoder(i) for i in latent_dims_list} - # self.decoders is independent of number of latent dims, but we also create a dict to separate - # them + # self.decoders is independent of number of latent dims, but we also create a dict to + # separate them self.decoders = {i: Decoder(latent_features, input_features) for i in latent_dims_list} - - # self.dvaes is a dict whose keys are the numbers of latent dims and the values are the models - # themselves + # self.dvaes is a dict whose keys are the numbers of latent dims and the values are the + # models themselves self.dvaes = {i: DVAE(self.encoders[i], self.decoders[i]) for i in latent_dims_list} @@ -248,19 +247,22 @@ def test_latent_to_discrete(self, n_samples, expected): @parameterized.expand([(i, j) for i in range(1, 3) for j in [0, 1, 5, 1000]]) def test_forward(self, n_latent_dims, n_samples): """Test the forward method.""" + torch.manual_seed(1234) # Set seed for reproducibility of latent_to_discrete sampling expected_latents = self.encoders[n_latent_dims](self.data) expected_discretes = self.dvaes[n_latent_dims].latent_to_discrete( expected_latents, n_samples ) expected_reconstructed_x = self.decoders[n_latent_dims](expected_discretes) + torch.manual_seed(1234) # Set seed again to ensure that the sampling in the forward method + # is the same as in the expected_discretes latents, discretes, reconstructed_x = self.dvaes[n_latent_dims].forward( x=self.data, n_samples=n_samples ) + torch.testing.assert_close(latents, expected_latents) + torch.testing.assert_close(discretes, expected_discretes) + torch.testing.assert_close(reconstructed_x, expected_reconstructed_x) - assert 
torch.equal(reconstructed_x, expected_reconstructed_x) - assert torch.equal(discretes, expected_discretes) - assert torch.equal(latents, expected_latents) if __name__ == "__main__":