Implement audio classification function using YAMNet

- Create a function to load audio, resample, and classify using YAMNet
- Ensure compatibility with different audio formats and sample rates
- Normalize audio and classify it into one of 600 categories
This commit is contained in:
Joel Mathew Thomas
2024-12-26 00:58:53 +05:30
parent cbebf7bd93
commit e4abb070db
2 changed files with 25 additions and 20 deletions
+1
View File
@@ -56,6 +56,7 @@ python-dateutil==2.9.0.post0
requests==2.32.3
rich==13.9.4
scikit-learn==1.6.0
scipy==1.14.1
setuptools==75.6.0
six==1.17.0
soundfile==0.12.1
+23 -19
View File
@@ -14,31 +14,35 @@ def class_names_from_csv(class_map_scv_text):
        reader = csv.DictReader(csvfile)
        for row in reader:
            class_names.append(row['display_name'])
    return class_names
# Main function to process audio and classify
def classify_audio(file_path):
    """
    Given an audio file, this function loads the audio, resamples it,
    normalizes it, and runs it through the YAMNet model to classify the sound.

    Args:
        file_path (str): Path to the audio file (WAV, MP3, etc.).

    Returns:
        str: Predicted class label of the audio.
    """
    # Load audio using librosa (this handles both loading, resampling, and conversion to mono)
    waveform, sample_rate = librosa.load(file_path, sr=16000, mono=True)  # Ensuring 16k sample rate and mono

    # Normalize the waveform to [-1.0, 1.0] (librosa already returns normalized values)
    waveform = waveform / np.max(np.abs(waveform))

    # Execute the YAMNet model
    scores, embeddings, spectrogram = model(waveform)

    # Extract the class names from the model
    class_map_path = model.class_map_path().numpy()
    class_names = class_names_from_csv(class_map_path)

    # Find the class with the highest score
    scores_np = scores.numpy()
    inferred_class = class_names[scores_np.mean(axis=0).argmax()]
    return inferred_class