TensorBoard for Grads and Individual Weights
I needed to record gradients and individual weights (rather than only per-tensor means) during training with TensorBoard, but the former is no longer available by default and the latter never was. So I updated the TF2 callback, and since some of you might find it useful, you can find it here.
I called it ExtendedTensorBoard and the definition is quite simple:
class ExtendedTensorBoard(tf.keras.callbacks.TensorBoard):
    """TensorBoard callback that additionally logs gradients and individual weights.

    Gradients are computed on a fixed batch supplied at construction time and
    logged by the module-level ``_log_grads``; weight logging is delegated to
    the module-level ``_log_weights_individual``.

    Args:
        validation_data: Indexable batch used to compute gradients. Element 0
            is fed to the model as input; element 1, when present, is used as
            the target. Any further elements (e.g. sample weights) are ignored.
            Pass ``(x,)`` for models without targets.
        n_individual_weight_samples: Number of individual entries to track per
            weight tensor.
        *args: Forwarded to ``tf.keras.callbacks.TensorBoard``.
        **kwargs: Forwarded to ``tf.keras.callbacks.TensorBoard``.
    """

    def __init__(self, validation_data, n_individual_weight_samples=3, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # This held-out batch is what the gradients are computed on.
        self._x_batch = validation_data[0]
        # Use "> 1" rather than "== 2" so that a longer tuple such as
        # (x, y, sample_weight) does not silently lose its targets.
        self._y_batch = validation_data[1] if len(validation_data) > 1 else None
        self.n_individual_weight_samples = n_individual_weight_samples

    def on_epoch_end(self, epoch, logs=None):
        """Run the base-class epoch-end logging, then log gradients.

        Gradients are logged only when ``histogram_freq`` is set and ``epoch``
        is a multiple of it.
        """
        super().on_epoch_end(epoch, logs=logs)
        if self.histogram_freq and epoch % self.histogram_freq == 0:
            _log_grads(self, epoch)

    def _log_weights(self, epoch):
        """Log weights through ``_log_weights_individual``.

        NOTE(review): presumably this overrides the base class hook that
        writes weight histograms -- confirm against the installed TF version.
        """
        _log_weights_individual(self, epoch)
I took this excellent Medium post as a starting point for the gradient logging and extended it to handle any model definition, with general inputs (multi-input case, no-output case) and general losses (auxiliary losses and no-output-loss cases). The final gradient logging looks as follows:
def _log_grads(self, epoch):
    """Log per-layer gradient summaries computed on the stored batch.

    Runs the model on ``self._x_batch``, evaluates ``self.model.compiled_loss``
    (passing ``self.model.losses`` as regularization losses) and, for every
    trainable weight of every layer, writes the mean absolute gradient as a
    scalar and the gradient itself as a histogram to ``self._train_writer``.

    Weights whose gradient is ``None`` (not connected to the loss) or has no
    elements are skipped instead of raising.

    Args:
        self: Callback instance providing ``model``, ``_x_batch``, ``_y_batch``
            and ``_train_writer``.
        epoch: Step under which the summaries are recorded.
    """
    layers = self.model.layers
    # The tape is persistent because gradient() is called once per layer below.
    with tf.GradientTape(persistent=True) as tape:
        tape.watch(self.model.trainable_weights)
        # Loss for the current state of the weights.
        predictions = self.model(self._x_batch)
        loss = self.model.compiled_loss(
            y_true=self._y_batch, y_pred=predictions, sample_weight=None, regularization_losses=self.model.losses
        )
    # Gradients w.r.t. the current weights, grouped by layer.
    grads = [tape.gradient(loss, layer.trainable_weights) for layer in layers]
    names = [layer.name for layer in layers]
    del tape  # release the resources held by the persistent tape

    with self._train_writer.as_default():
        with summary_ops_v2.always_record_summaries():
            for layer_grads, layer_name in zip(grads, names):
                for i, curr_grad in enumerate(layer_grads):
                    # tape.gradient yields None for weights that do not
                    # contribute to the loss; len(None) used to crash here.
                    if curr_grad is None:
                        continue
                    # Sparse gradients (e.g. from embedding lookups) are
                    # densified so they can be handled like any other tensor.
                    if isinstance(curr_grad, tf.IndexedSlices):
                        curr_grad = tf.convert_to_tensor(curr_grad)
                    # Static-shape emptiness test; unlike len() it also works
                    # for scalar-shaped gradients.
                    if curr_grad.shape.num_elements() == 0:
                        continue
                    nc = 'bias' if len(curr_grad.shape) == 1 else 'weight'
                    mean = tf.reduce_mean(tf.abs(curr_grad))
                    summary_ops_v2.scalar('grad_mean_{}_{}_{}'.format(layer_name, i + 1, nc), mean, step=epoch)
                    summary_ops_v2.histogram('grad_histogram_{}_{}_{}'.format(layer_name, i + 1, nc), curr_grad,
                                             step=epoch)
    self._train_writer.flush()
The logging of individual weights was motivated by the fact that sometimes the mean and standard deviation of the weight distribution did not change during training even though the task seemed to be solved successfully. So I decided to log individual weights to make sure they were changing even if the distribution was not. Here is the function that handles the logging of individual weights:
def _log_weights_individual(self, epoch):
    """Log weight histograms plus a few fixed individual entries per weight.

    For every weight of every layer a histogram is written, followed by
    ``self.n_individual_weight_samples`` scalar summaries, each tracking one
    randomly chosen entry of the weight. The chosen locations are stored in
    ``self.dict_scalar_locations`` so the same entries are followed across
    epochs. When ``self.write_images`` is set, the weight is also passed to
    ``self._log_weight_as_image``.

    Args:
        self: Callback instance providing ``model``, ``_train_writer``,
            ``n_individual_weight_samples`` and ``write_images``.
        epoch: Step under which the summaries are recorded.
    """
    # Reset the table at epoch 0 as before, and also create it lazily so that
    # a first call at a later epoch (e.g. fit(initial_epoch=k)) does not fail
    # on a missing attribute.
    if epoch == 0 or not hasattr(self, 'dict_scalar_locations'):
        self.dict_scalar_locations = {}
    locations = self.dict_scalar_locations

    with self._train_writer.as_default():
        with summary_ops_v2.always_record_summaries():
            for layer in self.model.layers:
                for weight in layer.weights:
                    weight_name = weight.name.replace(':', '_')
                    summary_ops_v2.histogram(weight_name, weight, step=epoch)
                    # The histogram above mirrors the stock TensorBoard
                    # callback; what follows tracks individual entries.
                    for i in range(self.n_individual_weight_samples):
                        scalar_name = '{}_{}'.format(weight_name, i)
                        if scalar_name not in locations:
                            # One random index per axis, chosen once and reused.
                            locations[scalar_name] = [int(np.random.choice(ax)) for ax in weight.shape]
                        # A tuple makes the "one index per axis" intent explicit.
                        index = tuple(locations[scalar_name])
                        summary_ops_v2.scalar(scalar_name, weight[index], step=epoch)
                    if self.write_images:
                        self._log_weight_as_image(weight, weight_name, epoch)
            self._train_writer.flush()
Let me know if it works for you or if it can be generalized to more use cases. So far it has worked for all the use cases I tried: image and language, with and without a final loss and auxiliary losses, and with multiple inputs. Let me know if you have any suggestions to improve it as well!