Merge pull request #23 from joelmathewthomas/feature/asteroid-wrapper

Feature/asteroid wrapper
2025-01-29 00:51:33 +05:30
parent 8e3e7dd062 f2e41dc7ef
commit e7b142504c
6 changed files with 162 additions and 0 deletions
@@ -1,7 +1,14 @@
 aiohappyeyeballs==2.4.4
 aiohttp==3.11.11
 aiosignal==1.3.2
 amqp==5.3.1
 antlr4-python3-runtime==4.9.3
 asteroid==0.7.0
 asteroid-filterbanks==0.4.0
 attrs==25.1.0
 audioread==3.0.1
 billiard==4.2.1
 cached-property==2.0.1
 celery==5.4.0
 certifi==2024.12.14
 cffi==1.17.1
@@ -19,7 +26,10 @@ dora_search==0.1.12
 einops==0.8.0
 filelock==3.17.0
 fonttools==4.55.6
 frozenlist==1.5.0
 fsspec==2024.12.0
 future==1.0.0
 huggingface-hub==0.28.0
 idna==3.10
 iniconfig==2.0.0
 Jinja2==3.1.5
@@ -30,11 +40,14 @@ kombu==5.4.2
 lameenc==1.8.1
 lazy_loader==0.4
 librosa==0.10.2.post1
 lightning-utilities==0.11.9
 llvmlite==0.44.0
 MarkupSafe==3.0.2
 matplotlib==3.10.0
 mir_eval==0.7
 mpmath==1.3.0
 msgpack==1.1.0
 multidict==6.1.0
 networkx==3.4.2
 numba==0.61.0
 numpy==2.1.3
@@ -53,15 +66,23 @@ nvidia-nvtx-cu12==12.4.127
 omegaconf==2.3.0
 openunmix==1.3.0
 packaging==24.2
 pandas==2.2.3
 pb-bss-eval==0.0.2
 pesq==0.0.4
 pillow==11.1.0
 platformdirs==4.3.6
 pluggy==1.5.0
 pooch==1.8.2
 prompt_toolkit==3.0.50
 propcache==0.2.1
 pycparser==2.22
 pyparsing==3.2.1
 pystoi==0.4.1
 pytest==8.3.4
 python-dateutil==2.9.0.post0
 pytorch-lightning==2.5.0.post0
 pytorch-ranger==0.1.1
 pytz==2024.2
 PyYAML==6.0.2
 redis==5.2.1
 requests==2.32.3
@@ -76,7 +97,10 @@ submitit==1.5.2
 sympy==1.13.1
 threadpoolctl==3.5.0
 torch==2.5.1
 torch-optimizer==0.1.0
 torch-stoi==0.2.3
 torchaudio==2.5.1
 torchmetrics==0.11.4
 tqdm==4.67.1
 treetable==0.2.5
 triton==3.1.0
@@ -85,3 +109,4 @@ tzdata==2025.1
 urllib3==2.3.0
 vine==5.1.0
 wcwidth==0.2.13
 yarl==1.18.3
@@ -0,0 +1,12 @@
 # __init__.py
 import logging
 from datetime import datetime
 # Configure logging
 logging.basicConfig(
    format='%(asctime)s : %(message)s',
    level = logging.INFO
 )
 logging.info("freq-split-enhance/postprocessing package has been imported.")
@@ -0,0 +1,36 @@
 import soundfile as sf
 import numpy as np
 def export_audio(audio, output_path, sr):
    """
    Save a NumPy audio array to a specified audio file.
    Args:
        audio (numpy.ndarray): The audio data to be saved.`
        output_path (str): The path where the audio file should be saved.
        sr (int): The sampling rate of the audio.
    """
    try:
        print(f"Initial audio shape: {audio.shape}, dtype: {audio.dtype}")
        if audio.ndim == 2 and audio.shape[0] == 2:
            # Transpose stereo audio to match the expected shape
            audio = audio.T  # From (2, num_samples) to (num_samples, 2)
        # Ensure the audio data type is float32
        audio = audio.astype('float32')
        # Normalize audio to avoid distortion
        if np.max(np.abs(audio)) > 0:  # Avoid divide by zero
            audio = audio / np.max(np.abs(audio))
        # Verify final format
        print(f"Final audio shape: {audio.shape}, dtype: {audio.dtype}, max: {np.max(audio)}, min: {np.min(audio)}")
        sf.write(output_path, audio, sr, format='wav')
        print(f"Audio saved to {output_path}")
    except Exception as e:
        print(f"Error saving audio: {e}")
@@ -0,0 +1,39 @@
 import torch
 from asteroid.models import ConvTasNet
 def separate(audio, model_name='mpariente/ConvTasNet_WHAM!_sepclean'):
    """
    Separates audio into sources using a pretrained Asteroid model.
    Args:
        audio (numpy.ndarray): The audio time series (1D numpy array).
        model_name (str): Name of the pretrained model from Asteroid. Default is 'mpariente/ConvTasNet_WHAM!_sepclean'.
    Returns:
        list: List of separated sources as numpy arrays.
    """
    try:
        # Select the device: GPU if available, otherwise CPU
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {device}")
        # Load the pretrained model and move it to the selected device
        model = ConvTasNet.from_pretrained(model_name).to(device)
        # Convert the audio array to a PyTorch tensor, add batch dimension, and move to device
        audio_tensor = torch.tensor(audio).unsqueeze(0).to(device)  # Shape: (1, num_samples)
        # Perform source separation
        with torch.no_grad():
            separated_sources = model(audio_tensor)
        # Remove batch dimension
        separated_sources = separated_sources.squeeze(0)  # Shape: (num_sources, num_samples)
        # Split into list of sources
        separated_sources_np = separated_sources.cpu().numpy()  # Convert to NumPy
        separated_sources_list = [separated_sources_np[i, :] for i in range(separated_sources_np.shape[0])]
        return separated_sources_list
    except Exception as e:
        raise RuntimeError(f"Error during separation: {e}")
@@ -7,6 +7,8 @@ from src.input.file_reader import read_audio
 from src.preprocessing.trim import trim_audio
 from src.preprocessing.resample import resample
 from src.separation.demucs_wrapper import separate_audio_with_demucs
 from src.separation.convtasnet_wrapper import separate
 from src.postprocessing.audio_writer import export_audio
 def test_demucs_separation_with_preprocessing():
@@ -51,3 +53,51 @@ def test_demucs_separation_with_preprocessing():
    for expected_file in expected_files:
        file_path = file_folder / expected_file
        assert file_path.exists(), f"Expected file {expected_file} not found in {file_name} folder."
 def test_convtasnet_separation_with_output_files():
    """
    Test to ensure ConvTasNet separation creates expected source audio files.
    """
    input_audio_path = "tests/test_audio/female-female-mixture.wav"
    output_dir = "/tmp/convtasnet"
    model_name = "mpariente/ConvTasNet_WHAM!_sepclean"
    audio, sr = read_audio(input_audio_path)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    separated_sources = separate(audio, model_name)
    for i, source in enumerate(separated_sources):
        output_path = os.path.join(output_dir, f"source_{i+1}.wav")
        export_audio(source, output_path, sr)
    # Check if the output directory exists
    assert os.path.exists(output_dir), "Output directory does not exist."
    # Check if source_1.wav and source_2.wav are created
    source_1_path = os.path.join(output_dir, "source_1.wav")
    source_2_path = os.path.join(output_dir, "source_2.wav")
    assert os.path.exists(source_1_path), "source_1.wav was not created."
    assert os.path.exists(source_2_path), "source_2.wav was not created."
    # Check if the files have content (not just created)
    # For example, you can check if the length of the audio files is greater than a certain threshold
    # Here, we'll just verify the files are not empty.
    import soundfile as sf
    def is_file_non_empty(file_path):
        try:
            data, _ = sf.read(file_path)
            return data.size > 0
        except Exception as e:
            return False
    assert is_file_non_empty(source_1_path), "source_1.wav is empty."
    assert is_file_non_empty(source_2_path), "source_2.wav is empty."
    print("Test passed: source_1.wav and source_2.wav are present and non-empty.")