def __call__(self, data):
    """Rotate ``data.pos`` onto the eigenvectors of its scatter matrix.

    The eigenvectors are computed from the mean-centred positions (or from
    a random subset of at most ``self.max_points`` of them when that limit
    is positive and exceeded).  ``data.pos`` itself is multiplied by the
    eigenvector matrix without being centred.  If ``data`` contains a
    ``'norm'`` entry it is rotated the same way and re-normalised.

    Args:
        data: Object exposing a ``pos`` tensor of shape ``(N, D)``,
            membership tests (``'norm' in data``) and, optionally, a
            ``norm`` tensor.

    Returns:
        The same ``data`` object, modified in place.
    """
    pos = data.pos

    # Optionally estimate the axes from a random subset of the points.
    if self.max_points > 0 and pos.size(0) > self.max_points:
        perm = torch.randperm(pos.size(0))
        pos = pos[perm[:self.max_points]]

    pos = pos - pos.mean(dim=0, keepdim=True)
    C = torch.matmul(pos.t(), pos)

    # ``torch.eig`` has been removed from PyTorch; ``torch.linalg.eig`` is
    # its replacement.  C is real and symmetric, so the eigenvectors are
    # real and only the real part of the complex result is needed.
    _, v = torch.linalg.eig(C)
    v = v.real  # v[:, j] is the j-th eigenvector

    data.pos = torch.matmul(data.pos, v)

    if 'norm' in data:
        data.norm = F.normalize(torch.matmul(data.norm, v))

    return data
def complex_whiten(complex_image, eps=1e-10):
    """Whiten a complex image stored as two real channels.

    The real and imaginary channels are centred, decorrelated by rotating
    onto the eigenvectors of their 2x2 covariance matrix, and scaled so
    that each resulting channel has (approximately) unit variance.

    Args:
        complex_image: Tensor of shape ``(H, W, 2)``; ``[..., 0]`` is the
            real part and ``[..., 1]`` the imaginary part.
        eps: Small constant added to the eigenvalues before the square
            root to avoid division by zero.

    Returns:
        A new tensor with the same shape as ``complex_image``.
    """
    real = complex_image[:, :, 0]
    imag = complex_image[:, :, 1]

    # Center each channel around its own mean.  Subtracting one scalar
    # mean taken over both channels would leave each channel off-centre.
    mean = torch.stack([real.mean(), imag.mean()])
    centered_complex_image = complex_image - mean
    centered_real = centered_complex_image[:, :, 0]
    centered_imag = centered_complex_image[:, :, 1]

    # Determine covariance between real and imaginary: E[(x - mx)(y - my)].
    n = real.nelement()
    real_real = centered_real.mul(centered_real).sum() / n
    real_imag = centered_real.mul(centered_imag).sum() / n
    imag_imag = centered_imag.mul(centered_imag).sum() / n
    V = torch.stack([
        torch.stack([real_real, real_imag]),
        torch.stack([real_imag, imag_imag]),
    ])

    # Remove correlation by rotating around covariance eigenvectors.
    # V is symmetric, so eigh applies (torch.eig has been removed).
    eig_values, eig_vecs = torch.linalg.eigh(V)
    whitened_image = torch.matmul(centered_complex_image, eig_vecs)

    # Scale by eigenvalues for unit variance.  Round-off can make a zero
    # eigenvalue slightly negative, hence the clamp before the sqrt.
    scale = (eig_values.clamp(min=0) + eps).sqrt()
    return whitened_image / scale
# Helper functions
def get_singular_gaussian_penalty(model):
    """Return a scalar that grows when attention covariances become singular.

    For every attention spread ``sigma_half_inv`` of every encoder layer,
    the condition number of ``sigma_half_inv^T @ sigma_half_inv`` is
    computed; the penalty is the mean squared deviation of those condition
    numbers from 1.

    Returns ``0`` (and prints a notice) when
    ``config["attention_isotropic_gaussian"]`` is set, and a zero tensor
    when the model exposes no attention spreads.
    """
    if config["attention_isotropic_gaussian"]:
        # TODO move at setup
        print("Singular gaussian penalty ignored as `attention_isotropic_gaussian` is True")
        return 0

    condition_numbers = []
    for layer in model.encoder.layer:
        for sigma_half_inv in layer.attention.self.attention_spreads:
            sigma_inv = sigma_half_inv.transpose(0, 1) @ sigma_half_inv
            # sigma_inv is symmetric by construction, so eigvalsh applies
            # (torch.eig has been removed from PyTorch).
            eig_values = torch.linalg.eigvalsh(sigma_inv).abs()
            condition_numbers.append(eig_values.max() / eig_values.min())

    if not condition_numbers:
        # Nothing to penalise; avoids a NaN from the mean of an empty tensor.
        return torch.tensor(0.0)

    # torch.stack (unlike torch.tensor) keeps the autograd graph, so the
    # penalty can be back-propagated when it is added to a loss.
    return torch.mean((torch.stack(condition_numbers) - 1) ** 2)
def expm_eig(A):
    """Matrix exponential of a symmetric matrix via eigendecomposition.

    Computes ``V @ diag(exp(w)) @ V.T`` where ``w`` and ``V`` are the
    eigenvalues and orthonormal eigenvectors of ``A``.

    Args:
        A: Real symmetric square tensor.  Only symmetric input gives a
            meaningful result, because ``V.T`` is used as the inverse of
            ``V``.

    Returns:
        Tensor with the same shape as ``A``.
    """
    # NOTE(review): the original return statement was syntactically broken;
    # the exponentiation of the eigenvalues is reconstructed from the
    # function's name - confirm against the upstream implementation.
    eigen_values, eigen_vector = th.linalg.eigh(A)
    return th.mm(th.mm(eigen_vector, th.diag(eigen_values.exp())), eigen_vector.t())
def _spectral_embedding(self, X):
    """
    Helper function to embed the dataset X into the eigenvectors of the graph Laplacian matrix

    Returns
    -------
    ht.DNDarray, shape=(m_lanczos):
        Eigenvalues of the graph's Laplacian matrix.
    ht.DNDarray, shape=(n, m_lanczos):
        Eigenvectors of the graph's Laplacian matrix.
    """
    L = self._laplacian.construct(X)
    # 3. Eigenvalue and -vector calculation via Lanczos Algorithm
    # NOTE(review): every argument except ``fill_value`` was missing from the
    # original call and has been reconstructed - confirm shape/dtype/split/device.
    v0 = ht.full(
        (L.shape[0],),
        fill_value=1.0 / math.sqrt(L.shape[0]),
        dtype=L.dtype,
        split=0,
        device=L.device,
    )
    V, T = ht.lanczos(L, self.n_lanczos, v0)

    # 4. Calculate and Sort Eigenvalues and Eigenvectors of tridiagonal matrix T
    # torch.eig has been removed; torch.linalg.eig returns complex tensors,
    # of which only the real parts were used by the original code.
    eig_vals, eig_vecs = torch.linalg.eig(T._DNDarray__array)
    # If x is an Eigenvector of T, then y = V@x is the corresponding Eigenvector of L
    eig_vals, idx = torch.sort(eig_vals.real, dim=0)
    eigenvalues = ht.array(eig_vals)
    eigenvectors = ht.matmul(V, ht.array(eig_vecs.real))[:, idx]

    return eigenvalues, eigenvectors
def reset_parameters(self):
    """Re-initialise the input, bias and recurrent weights of the module.

    * ``weight_ih_l0`` is drawn from U(-1, 1) and scaled by
      ``self.w_ih_scale[1:]``.
    * Keys matching ``weight_ih_l[^0]*`` are drawn from U(-1, 1).
    * ``bias_ih_l*`` is drawn from U(-1, 1) and scaled by
      ``self.w_ih_scale[0]``.
    * ``weight_hh_l*`` is drawn from U(-1, 1), sparsified so that a
      fraction ``1 - self.density`` of its entries is zero, and rescaled so
      that its spectral radius equals ``self.spectral_radius``.

    The updated tensors are loaded back into the module at the end.
    """
    weight_dict = self.state_dict()
    n_recurrent = self.hidden_size * self.hidden_size

    for key, value in weight_dict.items():
        if key == 'weight_ih_l0':
            nn.init.uniform_(value, -1, 1)
            value *= self.w_ih_scale[1:]
        elif re.fullmatch('weight_ih_l[^0]*', key):
            nn.init.uniform_(value, -1, 1)
        elif re.fullmatch('bias_ih_l[0-9]*', key):
            nn.init.uniform_(value, -1, 1)
            value *= self.w_ih_scale[0]
        elif re.fullmatch('weight_hh_l[0-9]*', key):
            w_hh = torch.empty(n_recurrent)
            w_hh.uniform_(-1, 1)
            if self.density < 1:
                # Zero a random subset holding a (1 - density) share of entries.
                zero_weights = torch.randperm(int(n_recurrent))
                zero_weights = zero_weights[:int(n_recurrent * (1 - self.density))]
                w_hh[zero_weights] = 0
            w_hh = w_hh.view(self.hidden_size, self.hidden_size)
            # Moduli of the (possibly complex) eigenvalues; torch.eig has
            # been removed from PyTorch.
            abs_eigs = torch.linalg.eigvals(w_hh).abs()
            weight_dict[key] = w_hh * (self.spectral_radius / torch.max(abs_eigs))

    # Re-assigning a dict entry does not touch the module's parameter, so the
    # rescaled recurrent weights must be loaded back explicitly.
    self.load_state_dict(weight_dict)
def spectral_radius(m):
    """
    Compute spectral radius of a square 2-D tensor
    :param m: squared 2D tensor
    :return: largest eigenvalue modulus of ``m`` as a Python float
    """
    # The spectral radius is the largest complex modulus |a + bi|.  Taking
    # abs() of real and imaginary parts separately would give max(|a|, |b|)
    # and under-estimate it for complex eigenvalues.
    return torch.max(torch.abs(torch.linalg.eigvals(m))).item()
# end spectral_radius
# Compute spectral radius of a square 2-D tensor for stacked-ESN
def __ge__(self, other):
    """
    Greater or equal
    :param other: object whose ``get_C()`` returns a square matrix
    :return: True if the largest eigenvalue (real part) of
        ``other.get_C() - self.w_out`` is non-negative
    """
    # Compute eigenvalues of a - b
    # torch.eig returned a (values, vectors) tuple that cannot be passed to
    # torch.max, and has since been removed from PyTorch.  Only real parts
    # are compared: the imaginary parts of real eigenvalues are zero and
    # would make a ">= 0" test on them trivially true.
    # NOTE(review): operand order (other - self) is kept as in the original.
    eig_v = torch.linalg.eigvals(other.get_C() - self.w_out).real
    return float(torch.max(eig_v)) >= 0.0
# end __ge__
# Greater
def __gt__(self, other):
    """
    Greater
    :param other: object whose ``get_C()`` returns a square matrix
    :return: True if the largest eigenvalue (real part) of
        ``other.get_C() - self.w_out`` is strictly positive
    """
    # Compute eigenvalues of a - b
    # torch.eig returned a (values, vectors) tuple that cannot be passed to
    # torch.max, and has since been removed from PyTorch.  Only the real
    # parts of the eigenvalues are compared.
    # NOTE(review): operand order (other - self) is kept as in the original.
    eig_v = torch.linalg.eigvals(other.get_C() - self.w_out).real
    return float(torch.max(eig_v)) > 0.0
# end __gt__
# Less
def forward(self, adj, feat, lambda_max=None):
    r"""Compute (Dense) Chebyshev Spectral Graph Convolution layer.

    Parameters
    ----------
    adj : torch.Tensor
        The adjacency matrix of the graph to apply Graph Convolution on,
        should be of shape :math:`(N, N)`, where a row represents the destination
        and a column represents the source.
    feat : torch.Tensor
        The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
        is size of input feature, :math:`N` is the number of nodes.
    lambda_max : float or None, optional
        A float value indicates the largest eigenvalue of given graph.
        Default: None.

    Returns
    -------
    torch.Tensor
        The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
        is size of output feature.
    """
    # NOTE(review): the right-hand side of this assignment was missing in the
    # original; casting adj to feat's dtype/device is reconstructed - confirm.
    A = adj.to(feat)
    num_nodes = A.shape[0]

    # Symmetrically normalised Laplacian  L = I - D^{-1/2} A D^{-1/2}.
    in_degree = 1 / A.sum(dim=1).clamp(min=1).sqrt()
    D_invsqrt = th.diag(in_degree)
    I = th.eye(num_nodes).to(A)
    L = I - D_invsqrt @ A @ D_invsqrt

    if lambda_max is None:
        # th.eig has been removed; keep using the real parts of the spectrum.
        lambda_ = th.linalg.eigvals(L).real
        lambda_max = lambda_.max()

    # Rescale the spectrum into [-1, 1] for the Chebyshev recursion.
    L_hat = 2 * L / lambda_max - I

    # T_0 = I, T_1 = L_hat, T_i = 2 * L_hat @ T_{i-1} - T_{i-2}
    Z = [th.eye(num_nodes).to(A)]
    for i in range(1, self._k):
        if i == 1:
            Z.append(L_hat)
        else:
            Z.append(2 * L_hat @ Z[-1] - Z[-2])

    Zs = th.stack(Z, 0)  # (k, n, n)

    Zh = (Zs @ feat.unsqueeze(0) @ self.W)
    Zh = Zh.sum(0)

    if self.bias is not None:
        Zh = Zh + self.bias
    return Zh
def find_degenerated_heads(model):
    """
    returns a dict of degenerated head per layer like {layer_idx -> [head_idx, ...]}

    A head is reported when, in the isotropic case, its squared spread is
    below 1e-5, or otherwise when the matrix
    ``sigma_half_inv^T @ sigma_half_inv`` has a condition number above 1000
    or a largest eigenvalue below 1e-5.  When any head is reported, a table
    with the reasons is printed.
    """
    model_params = dict(model.named_parameters())
    degenerated_heads = OrderedDict()
    degenerated_reasons = []

    for layer_idx in range(config["num_hidden_layers"]):
        prune_heads = []
        sigmas_half_inv = model_params["encoder.layer.{}.attention.self.attention_spreads".format(layer_idx)]

        for head_idx in range(config["num_attention_heads"]):
            head_is_degenerated = False

            if config["attention_isotropic_gaussian"]:
                sigma_inv = sigmas_half_inv[head_idx]
                if sigma_inv ** 2 < 1e-5:
                    degenerated_reasons.append("Sigma too low -> uniform attention: sigma**-2= {}".format(sigma_inv ** 2))
                    head_is_degenerated = True
            else:
                sigma_half_inv = sigmas_half_inv[head_idx]
                sigma_inv = sigma_half_inv.transpose(0, 1) @ sigma_half_inv
                # sigma_inv is symmetric by construction, so eigvalsh applies
                # (torch.eig has been removed from PyTorch).
                eig_values = torch.linalg.eigvalsh(sigma_inv).abs()
                condition_number = eig_values.max() / eig_values.min()

                if condition_number > 1000:
                    degenerated_reasons.append("Covariance matrix is ill defined: condition number = {}".format(condition_number))
                    head_is_degenerated = True
                elif eig_values.max() < 1e-5:
                    degenerated_reasons.append("Covariance matrix is close to 0: largest eigen value = {}".format(eig_values.max()))
                    head_is_degenerated = True

            if head_is_degenerated:
                # Exactly one reason is recorded per degenerated head, so the
                # reasons line up with the heads when the table is built below.
                prune_heads.append(head_idx)

        if prune_heads:
            degenerated_heads[layer_idx] = prune_heads

    if degenerated_heads:
        print("Degenerated heads:")
        reasons = iter(degenerated_reasons)
        table = [(layer, head, next(reasons)) for layer, heads in degenerated_heads.items() for head in heads]
        print(tabulate.tabulate(table, headers=["layer", "head", "reason"]))

    return degenerated_heads