TensorFlowで音声処理

TensorFlowでMelSpectrogram

tf.signal.stftはTFLiteで使えない。そのため、以下のコードではTFLITE=Trueのときにtf.signal.stftの代わりにstft_magnitude_tfliteを使う。

import numpy as np
import librosa
import librosa.display
import cv2

import tensorflow as tf

def MinMaxNorm(tensor):
    """Min-max normalize each batch element of a rank-3 tensor.

    The minimum and maximum are reduced over axes 1 and 2, so every entry
    along axis 0 is scaled with its own statistics. The maximum is floored
    at 2.0 before scaling, which means an input whose peak is below 2.0
    does not reach 1.0 after normalization.

    Args:
        tensor: Float tensor of rank 3; axis 0 is treated as the batch axis.

    Returns:
        A tensor with the same shape as ``tensor`` holding
        ``(tensor - min) / (max(max, 2.0) - min + 1e-8)``.
    """
    # keepdims=True leaves the statistics shaped [batch, 1, 1] so they
    # broadcast per batch element. Without it they are shaped [batch] and
    # line up with the LAST axis instead, which only works for batch == 1.
    minval = tf.reduce_min(tensor, axis=[1, 2], keepdims=True)
    maxval = tf.reduce_max(tensor, axis=[1, 2], keepdims=True)

    # Floor the upper bound at 2.0.
    # NOTE(review): presumably this keeps low-level input from being
    # stretched to full scale -- confirm the intent of the constant.
    maxval = tf.maximum(2.0, maxval)

    # The small epsilon guards against division by zero.
    return tf.divide(tensor - minval, maxval - minval + 1e-8)


def ToMelTF0(y, sr, DIM=128, TFLITE=False):
    """Compute a normalized log-mel spectrogram from a waveform batch.

    Args:
        y: Waveform data indexed as [batch, samples]. The TFLITE path
            reshapes it to a single row, so that path assumes batch size 1.
        sr: Sample rate in Hz, used to build the mel filter bank.
        DIM: Number of mel bins in the output.
        TFLITE: When true, compute the STFT magnitude with
            ``stft_magnitude_tflite`` instead of ``tf.signal.stft``.

    Returns:
        The log of the mel spectrogram (stabilized with 1e-6) after
        ``MinMaxNorm``. On the ``tf.signal.stft`` path the last axis has
        ``DIM`` entries and the leading axes follow the STFT output.
    """
    # Read up front so a rank-1 input fails here, before any TF op runs.
    num_samples = y.shape[1]

    # 1024-sample frames (64 ms at 16 kHz) hopped by 256 samples, i.e. 75%
    # overlap between consecutive frames.
    frame_length, frame_step, fft_length = 1024, 256, 1024

    if not TFLITE:
        stfts = tf.signal.stft(y, frame_length=frame_length,
                               frame_step=frame_step,
                               fft_length=fft_length, pad_end=False)
        spectrograms = tf.abs(stfts)
    else:
        y = tf.reshape(y, [1, int(num_samples)])
        spectrograms, stfts = stft_magnitude_tflite(
            y, window_length_samples=frame_length,
            hop_length_samples=frame_step, fft_length=fft_length)

    # Warp the linear-scale spectrograms onto the mel scale.
    num_spectrogram_bins = stfts.shape[-1]
    lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        DIM, num_spectrogram_bins, sr, lower_edge_hertz, upper_edge_hertz)
    mel_spectrograms = tf.tensordot(
        spectrograms, linear_to_mel_weight_matrix, 1)

    # tensordot can lose static shape information; restore it from the
    # operands so downstream ops see a fully defined shape.
    mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))

    # Stabilized log: the offset avoids log(0) on silent bins.
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
    return MinMaxNorm(log_mel_spectrograms)


def ToMelTF(filename, sr=16000):
    """Load an audio file and return its normalized log-mel spectrogram.

    Args:
        filename: Path of the audio file, passed to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load``.

    Returns:
        NumPy array: the first (only) batch entry of the ``ToMelTF0`` result.
    """
    # duration=100000 is an upper bound on how much audio librosa reads.
    waveform, rate = librosa.load(filename, sr=sr, duration=100000)

    # ToMelTF0 expects a leading batch axis, so add one of size 1.
    batch = waveform[np.newaxis]
    return ToMelTF0(batch, rate).numpy()[0]
前へ
次へ