Format PicoAudio and move into tta bin (#378)

* edit picoaudio into tta category
* black format picoaudio
open-mmlab · Jan 2, 2025 · 04dfe6e · 04dfe6e
1 parent c0b0df0
commit 04dfe6e
Show file tree

Hide file tree

Showing 99 changed files with 922 additions and 12,964 deletions.
diff --git a/...ally_controllable_tta/data/meta_data/test-frequency-control_onoffFromGpt_multi-event.json b/...ally_controllable_tta/data/meta_data/test-frequency-control_onoffFromGpt_multi-event.json
diff --git a/...lly_controllable_tta/data/meta_data/test-frequency-control_onoffFromGpt_single-event.json b/...lly_controllable_tta/data/meta_data/test-frequency-control_onoffFromGpt_single-event.json
diff --git a/models/temporally_controllable_tta/data/meta_data/test-onoff-control_multi-event.json b/models/temporally_controllable_tta/data/meta_data/test-onoff-control_multi-event.json
diff --git a/models/temporally_controllable_tta/data/meta_data/test-onoff-control_single-event.json b/models/temporally_controllable_tta/data/meta_data/test-onoff-control_single-event.json
diff --git a/models/temporally_controllable_tta/data/meta_data/train.json b/models/temporally_controllable_tta/data/meta_data/train.json
diff --git a/models/temporally_controllable_tta/picoaudio/audioldm/clap/training/audioset_textmap.npy b/models/temporally_controllable_tta/picoaudio/audioldm/clap/training/audioset_textmap.npy
diff --git a/models/temporally_controllable_tta/picoaudio/audioldm/variational_autoencoder/__init__.py b/models/temporally_controllable_tta/picoaudio/audioldm/variational_autoencoder/__init__.py
diff --git a/models/temporally_controllable_tta/picoaudio/data/filter_data.py b/models/temporally_controllable_tta/picoaudio/data/filter_data.py
diff --git a/...ollable_tta/picoaudio/data/meta_data/test-frequency-control_onoffFromGpt_multi-event.json b/...ollable_tta/picoaudio/data/meta_data/test-frequency-control_onoffFromGpt_multi-event.json
diff --git a/...llable_tta/picoaudio/data/meta_data/test-frequency-control_onoffFromGpt_single-event.json b/...llable_tta/picoaudio/data/meta_data/test-frequency-control_onoffFromGpt_single-event.json
diff --git a/.../temporally_controllable_tta/picoaudio/data/meta_data/test-onoff-control_multi-event.json b/.../temporally_controllable_tta/picoaudio/data/meta_data/test-onoff-control_multi-event.json
diff --git a/...temporally_controllable_tta/picoaudio/data/meta_data/test-onoff-control_single-event.json b/...temporally_controllable_tta/picoaudio/data/meta_data/test-onoff-control_single-event.json
diff --git a/models/temporally_controllable_tta/picoaudio/data/meta_data/train.json b/models/temporally_controllable_tta/picoaudio/data/meta_data/train.json
diff --git a/models/temporally_controllable_tta/picoaudio/runner/controllable_inference.py b/models/temporally_controllable_tta/picoaudio/runner/controllable_inference.py
diff --git a/models/temporally_controllable_tta/picoaudio/utils/__pycache__/torch_tools.cpython-310.pyc b/models/temporally_controllable_tta/picoaudio/utils/__pycache__/torch_tools.cpython-310.pyc
diff --git a/models/temporally_controllable_tta/README.md → models/tta/picoaudio/README.md b/models/temporally_controllable_tta/README.md → models/tta/picoaudio/README.md
diff --git a/...llable_tta/picoaudio/audioldm/__init__.py → .../picoaudio/picoaudio/audioldm/__init__.py b/...llable_tta/picoaudio/audioldm/__init__.py → .../picoaudio/picoaudio/audioldm/__init__.py
@@ -1,8 +1,3 @@
 from .ldm import LatentDiffusion
 from .utils import seed_everything, save_wave, get_time, get_duration
 from .pipeline import *
-
-
-
-
-
diff --git a/...llable_tta/picoaudio/audioldm/__main__.py → .../picoaudio/picoaudio/audioldm/__main__.py b/...llable_tta/picoaudio/audioldm/__main__.py → .../picoaudio/picoaudio/audioldm/__main__.py
@@ -1,11 +1,19 @@
 #!/usr/bin/python3
 import os
-from audioldm import text_to_audio, style_transfer, build_model, save_wave, get_time, round_up_duration, get_duration
+from audioldm import (
+    text_to_audio,
+    style_transfer,
+    build_model,
+    save_wave,
+    get_time,
+    round_up_duration,
+    get_duration,
+)
 import argparse
 
 CACHE_DIR = os.getenv(
-    "AUDIOLDM_CACHE_DIR",
-    os.path.join(os.path.expanduser("~"), ".cache/audioldm"))
+    "AUDIOLDM_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache/audioldm")
+)
 
 parser = argparse.ArgumentParser()
 
@@ -15,7 +23,7 @@
     required=False,
     default="generation",
     help="generation: text-to-audio generation; transfer: style transfer",
-    choices=["generation", "transfer"]
+    choices=["generation", "transfer"],
 )
 
 parser.add_argument(
@@ -59,7 +67,7 @@
     required=False,
     help="The checkpoint you gonna use",
     default="audioldm-s-full",
-    choices=["audioldm-s-full", "audioldm-l-full", "audioldm-s-full-v2"]
+    choices=["audioldm-s-full", "audioldm-l-full", "audioldm-s-full-v2"],
 )
 
 parser.add_argument(
@@ -125,21 +133,21 @@
 
 args = parser.parse_args()
 
-if(args.ckpt_path is not None):
+if args.ckpt_path is not None:
     print("Warning: ckpt_path has no effect after version 0.0.20.")
-    
+
 assert args.duration % 2.5 == 0, "Duration must be a multiple of 2.5"
 
 mode = args.mode
-if(mode == "generation" and args.file_path is not None):
+if mode == "generation" and args.file_path is not None:
     mode = "generation_audio_to_audio"
-    if(len(args.text) > 0):
+    if len(args.text) > 0:
         print("Warning: You have specified the --file_path. --text will be ignored")
         args.text = ""
-        
+
 save_path = os.path.join(args.save_path, mode)
 
-if(args.file_path is not None):
+if args.file_path is not None:
     save_path = os.path.join(save_path, os.path.basename(args.file_path.split(".")[0]))
 
 text = args.text
@@ -151,7 +159,7 @@
 os.makedirs(save_path, exist_ok=True)
 audioldm = build_model(model_name=args.model_name)
 
-if(args.mode == "generation"):
+if args.mode == "generation":
     waveform = text_to_audio(
         audioldm,
         text,
@@ -163,10 +171,13 @@
         n_candidate_gen_per_text=n_candidate_gen_per_text,
         batchsize=args.batchsize,
     )
-    
-elif(args.mode == "transfer"):
+
+elif args.mode == "transfer":
     assert args.file_path is not None
-    assert os.path.exists(args.file_path), "The original audio file \'%s\' for style transfer does not exist." % args.file_path
+    assert os.path.exists(args.file_path), (
+        "The original audio file '%s' for style transfer does not exist."
+        % args.file_path
+    )
     waveform = style_transfer(
         audioldm,
         text,
@@ -178,6 +189,6 @@
         ddim_steps=args.ddim_steps,
         batchsize=args.batchsize,
     )
-    waveform = waveform[:,None,:]
+    waveform = waveform[:, None, :]
 
 save_wave(waveform, save_path, name="%s_%s" % (get_time(), text))
diff --git a/..._tta/picoaudio/audioldm/audio/__init__.py → ...udio/picoaudio/audioldm/audio/__init__.py b/..._tta/picoaudio/audioldm/audio/__init__.py → ...udio/picoaudio/audioldm/audio/__init__.py
diff --git a/...oaudio/audioldm/audio/audio_processing.py → ...oaudio/audioldm/audio/audio_processing.py b/...oaudio/audioldm/audio/audio_processing.py → ...oaudio/audioldm/audio/audio_processing.py
diff --git a/...able_tta/picoaudio/audioldm/audio/stft.py → ...icoaudio/picoaudio/audioldm/audio/stft.py b/...able_tta/picoaudio/audioldm/audio/stft.py → ...icoaudio/picoaudio/audioldm/audio/stft.py
@@ -52,7 +52,7 @@ def __init__(self, filter_length, hop_length, win_length, window="hann"):
     def transform(self, input_data):
         device = self.forward_basis.device
         input_data = input_data.to(device)
-        
+
         num_batches = input_data.size(0)
         num_samples = input_data.size(1)
 
@@ -72,7 +72,7 @@ def transform(self, input_data):
             torch.autograd.Variable(self.forward_basis, requires_grad=False),
             stride=self.hop_length,
             padding=0,
-        )#.cpu()
+        )  # .cpu()
 
         cutoff = int((self.filter_length / 2) + 1)
         real_part = forward_transform[:, :cutoff, :]
@@ -86,7 +86,7 @@ def transform(self, input_data):
     def inverse(self, magnitude, phase):
         device = self.forward_basis.device
         magnitude, phase = magnitude.to(device), phase.to(device)
-        
+
         recombine_magnitude_phase = torch.cat(
             [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
         )
@@ -149,7 +149,11 @@ def __init__(
         self.sampling_rate = sampling_rate
         self.stft_fn = STFT(filter_length, hop_length, win_length)
         mel_basis = librosa_mel_fn(
-            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax
+            sr=sampling_rate,
+            n_fft=filter_length,
+            n_mels=n_mel_channels,
+            fmin=mel_fmin,
+            fmax=mel_fmax,
         )
         mel_basis = torch.from_numpy(mel_basis).float()
         self.register_buffer("mel_basis", mel_basis)

diff --git a/...ble_tta/picoaudio/audioldm/audio/tools.py → ...coaudio/picoaudio/audioldm/audio/tools.py b/...ble_tta/picoaudio/audioldm/audio/tools.py → ...coaudio/picoaudio/audioldm/audio/tools.py
@@ -43,6 +43,7 @@ def pad_wav(waveform, segment_length):
         temp_wav[:, :waveform_length] = waveform
     return temp_wav
 
+
 def normalize_wav(waveform):
     waveform = waveform - np.mean(waveform)
     waveform = waveform / (np.max(np.abs(waveform)) + 1e-8)
@@ -57,10 +58,10 @@ def read_wav_file(filename, segment_length):
     waveform = normalize_wav(waveform)
     waveform = waveform[None, ...]
     waveform = pad_wav(waveform, segment_length)
-    
+
     waveform = waveform / np.max(np.abs(waveform))
     waveform = 0.5 * waveform
-    
+
     return waveform
 
 

diff --git a/...e_tta/picoaudio/audioldm/clap/__init__.py → ...audio/picoaudio/audioldm/clap/__init__.py b/...e_tta/picoaudio/audioldm/clap/__init__.py → ...audio/picoaudio/audioldm/clap/__init__.py
diff --git a/...e_tta/picoaudio/audioldm/clap/encoders.py → ...audio/picoaudio/audioldm/clap/encoders.py b/...e_tta/picoaudio/audioldm/clap/encoders.py → ...audio/picoaudio/audioldm/clap/encoders.py
@@ -14,7 +14,7 @@ def __init__(
         key="class",
         sampling_rate=16000,
         embed_mode="audio",
-        amodel = "HTSAT-tiny",
+        amodel="HTSAT-tiny",
         unconditional_prob=0.1,
         random_mute=False,
         max_random_mute_portion=0.5,
@@ -92,7 +92,11 @@ def cos_similarity(self, waveform, text):
             audio_emb = self(waveform.cuda())
             self.embed_mode = "text"
             text_emb = self(text)
-            similarity = F.cosine_similarity(audio_emb, text_emb, dim=2), audio_emb, text_emb
+            similarity = (
+                F.cosine_similarity(audio_emb, text_emb, dim=2),
+                audio_emb,
+                text_emb,
+            )
             return similarity.squeeze()
 
     def forward(self, batch, key=None):
@@ -167,4 +171,4 @@ def tokenizer(self, text):
             max_length=512,
             return_tensors="pt",
         )
-        return {k: v.squeeze(0) for k, v in result.items()}
+        return {k: v.squeeze(0) for k, v in result.items()}
diff --git a/...audio/audioldm/clap/open_clip/__init__.py → ...audio/audioldm/clap/open_clip/__init__.py b/...audio/audioldm/clap/open_clip/__init__.py → ...audio/audioldm/clap/open_clip/__init__.py
diff --git a/...picoaudio/audioldm/clap/open_clip/bert.py → ...picoaudio/audioldm/clap/open_clip/bert.py b/...picoaudio/audioldm/clap/open_clip/bert.py → ...picoaudio/audioldm/clap/open_clip/bert.py
diff --git a/...ap/open_clip/bpe_simple_vocab_16e6.txt.gz → ...ap/open_clip/bpe_simple_vocab_16e6.txt.gz b/...ap/open_clip/bpe_simple_vocab_16e6.txt.gz → ...ap/open_clip/bpe_simple_vocab_16e6.txt.gz