zhu-han committed on
Commit
9e4e0d2
·
1 Parent(s): 32ffa33

Update to version 0.1.4

Browse files
omnivoice/cli/demo.py CHANGED
@@ -213,8 +213,7 @@ def build_demo(
213
  except Exception as e:
214
  return None, f"Error: {type(e).__name__}: {e}"
215
 
216
- waveform = audio[0].squeeze(0).numpy() # (T,)
217
- waveform = (waveform * 32767).astype(np.int16)
218
  return (sampling_rate, waveform), "Done."
219
 
220
  # Allow external wrappers (e.g. spaces.GPU for ZeroGPU Spaces)
 
213
  except Exception as e:
214
  return None, f"Error: {type(e).__name__}: {e}"
215
 
216
+ waveform = (audio[0] * 32767).astype(np.int16)
 
217
  return (sampling_rate, waveform), "Done."
218
 
219
  # Allow external wrappers (e.g. spaces.GPU for ZeroGPU Spaces)
omnivoice/cli/infer.py CHANGED
@@ -23,7 +23,8 @@ import argparse
23
  import logging
24
 
25
  import torch
26
- import torchaudio
 
27
 
28
  from omnivoice.models.omnivoice import OmniVoice
29
  from omnivoice.utils.common import str2bool
@@ -149,7 +150,7 @@ def main():
149
  class_temperature=args.class_temperature,
150
  )
151
 
152
- torchaudio.save(args.output, audios[0], model.sampling_rate)
153
  logging.info(f"Saved to {args.output}")
154
 
155
 
 
23
  import logging
24
 
25
  import torch
26
+
27
+ import soundfile as sf
28
 
29
  from omnivoice.models.omnivoice import OmniVoice
30
  from omnivoice.utils.common import str2bool
 
150
  class_temperature=args.class_temperature,
151
  )
152
 
153
+ sf.write(args.output, audios[0], model.sampling_rate)
154
  logging.info(f"Saved to {args.output}")
155
 
156
 
omnivoice/cli/infer_batch.py CHANGED
@@ -42,10 +42,11 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
42
  from typing import List, Optional, Tuple
43
 
44
  import torch
45
- import torchaudio
46
  from tqdm import tqdm
47
 
48
  from omnivoice.models.omnivoice import OmniVoice
 
 
49
  from omnivoice.utils.audio import load_audio
50
  from omnivoice.utils.common import str2bool
51
  from omnivoice.utils.data_utils import read_test_list
@@ -79,11 +80,17 @@ def get_parser():
79
  type=str,
80
  required=True,
81
  help="Path to the JSONL file containing test samples. "
82
- 'Each line is a JSON object: {"id": "name", "text": "...", '
83
- '"ref_audio": "/path.wav", "ref_text": "...", '
84
- '"language_id": "en", "language_name": "English", '
85
- '"duration": 10.0, "speed": 1.2}. '
86
- "language_id, language_name, duration, and speed are optional.",
 
 
 
 
 
 
87
  )
88
  parser.add_argument(
89
  "--res_dir",
@@ -135,8 +142,7 @@ def get_parser():
135
  "--batch_duration",
136
  type=float,
137
  default=1000.0,
138
- help="Maximum total duration (reference + generated) per batch (seconds). "
139
- "Only effective for parallel_chunk / no chunk mode.",
140
  )
141
  parser.add_argument(
142
  "--batch_size",
@@ -239,7 +245,7 @@ def process_init(rank_queue, model_checkpoint, warmup=0):
239
  dummy_ref_audio = (
240
  torch.randn(1, SAMPLING_RATE),
241
  SAMPLING_RATE,
242
- ) # 1s silence
243
  for i in range(warmup):
244
  worker_model.generate(
245
  text=["hello"],
@@ -255,40 +261,58 @@ def process_init(rank_queue, model_checkpoint, warmup=0):
255
  def estimate_sample_total_duration(
256
  duration_estimator: RuleDurationEstimator,
257
  text: str,
258
- ref_text: str,
259
- ref_audio_path: str,
260
  gen_duration: Optional[float] = None,
261
  ) -> float:
262
- ref_wav = load_audio(ref_audio_path, SAMPLING_RATE)
263
- ref_duration = ref_wav.shape[-1] / SAMPLING_RATE
 
 
 
 
 
 
 
 
 
264
 
265
  if gen_duration is None:
266
- gen_duration = duration_estimator.estimate_duration(
267
- text, ref_text, ref_duration, low_threshold=2.0
268
- )
 
 
 
 
 
269
 
270
  total_duration = ref_duration + gen_duration
271
  return total_duration
272
 
273
 
274
- def cluster_samples_by_duration(
275
  samples: List[Tuple],
276
  duration_estimator: RuleDurationEstimator,
277
- batch_duration: float,
278
- ) -> List[List[Tuple]]:
279
  sample_with_duration = []
280
  for sample in samples:
281
- save_name, ref_text, ref_audio_path, text, lang_id, lang_name, dur, spd = sample
282
  total_duration = estimate_sample_total_duration(
283
- duration_estimator,
284
- text,
285
- ref_text,
286
- ref_audio_path,
287
- gen_duration=dur,
288
  )
289
  sample_with_duration.append((sample, total_duration))
290
-
291
  sample_with_duration.sort(key=lambda x: x[1], reverse=True)
 
 
 
 
 
 
 
 
 
292
  batches = []
293
  current_batch = []
294
  current_total_duration = 0.0
@@ -319,19 +343,7 @@ def cluster_samples_by_batch_size(
319
  batch_size: int,
320
  ) -> List[List[Tuple]]:
321
  """Split samples into fixed-size batches, sorted by duration to minimize padding."""
322
- sample_with_duration = []
323
- for sample in samples:
324
- save_name, ref_text, ref_audio_path, text, lang_id, lang_name, dur, spd = sample
325
- total_duration = estimate_sample_total_duration(
326
- duration_estimator,
327
- text,
328
- ref_text,
329
- ref_audio_path,
330
- gen_duration=dur,
331
- )
332
- sample_with_duration.append((sample, total_duration))
333
-
334
- sample_with_duration.sort(key=lambda x: x[1], reverse=True)
335
  sorted_samples = [s for s, _ in sample_with_duration]
336
 
337
  batches = [
@@ -359,9 +371,10 @@ def run_inference_batch(
359
  langs = []
360
  durations = []
361
  speeds = []
 
362
 
363
  for sample in batch_samples:
364
- save_name, ref_text, ref_audio_path, text, lang_id, lang_name, dur, spd = sample
365
  save_names.append(save_name)
366
  ref_texts.append(ref_text)
367
  ref_audio_paths.append(ref_audio_path)
@@ -369,15 +382,17 @@ def run_inference_batch(
369
  langs.append(lang_id)
370
  durations.append(dur)
371
  speeds.append(spd)
 
372
 
373
  start_time = time.time()
374
  audios = worker_model.generate(
375
  text=texts,
376
  language=langs,
377
- ref_audio=ref_audio_paths,
378
- ref_text=ref_texts,
379
  duration=durations if any(d is not None for d in durations) else None,
380
  speed=speeds if any(s is not None for s in speeds) else None,
 
381
  **gen_kwargs,
382
  )
383
  batch_synth_time = time.time() - start_time
@@ -385,7 +400,7 @@ def run_inference_batch(
385
  results = []
386
  for save_name, audio in zip(save_names, audios):
387
  save_path = os.path.join(res_dir, save_name + ".wav")
388
- torchaudio.save(save_path, audio, worker_model.sampling_rate)
389
  audio_duration = audio.shape[-1] / worker_model.sampling_rate
390
  results.append(
391
  (
@@ -436,13 +451,14 @@ def main():
436
  samples.append(
437
  (
438
  s["id"],
439
- s["ref_text"],
440
- s["ref_audio"],
441
  s["text"],
442
  lang_id,
443
  lang_name,
444
  s.get("duration"),
445
  s.get("speed"),
 
446
  )
447
  )
448
 
@@ -457,18 +473,32 @@ def main():
457
  ) as executor:
458
  futures = []
459
 
460
- # parallel_chunk / no chunk
461
  logging.info("Running batch inference")
462
 
 
 
 
 
 
 
 
463
  duration_estimator = RuleDurationEstimator()
464
- if args.batch_size > 0:
465
- batches = cluster_samples_by_batch_size(
466
- samples, duration_estimator, args.batch_size
467
- )
468
- else:
469
- batches = cluster_samples_by_duration(
470
- samples, duration_estimator, args.batch_duration
471
- )
 
 
 
 
 
 
 
 
472
 
473
  args_dict = vars(args)
474
 
 
42
  from typing import List, Optional, Tuple
43
 
44
  import torch
 
45
  from tqdm import tqdm
46
 
47
  from omnivoice.models.omnivoice import OmniVoice
48
+ import soundfile as sf
49
+
50
  from omnivoice.utils.audio import load_audio
51
  from omnivoice.utils.common import str2bool
52
  from omnivoice.utils.data_utils import read_test_list
 
80
  type=str,
81
  required=True,
82
  help="Path to the JSONL file containing test samples. "
83
+ "Each line is a JSON object with the following fields: "
84
+ '"id" (str, required): unique name for the output file; '
85
+ '"text" (str, required): text to synthesize; '
86
+ '"ref_audio" (str): path to reference audio for voice cloning; '
87
+ '"ref_text" (str): transcript of the reference audio; '
88
+ '"instruct" (str): instruction for voice design (used when ref_audio is absent); '
89
+ '"language_id" (str): language code, e.g. "en"; '
90
+ '"language_name" (str): language name, e.g. "English"; '
91
+ '"duration" (float): target duration in seconds; '
92
+ '"speed" (float): speaking speed multiplier. '
93
+ "Only id and text are required; all other fields are optional.",
94
  )
95
  parser.add_argument(
96
  "--res_dir",
 
142
  "--batch_duration",
143
  type=float,
144
  default=1000.0,
145
+ help="Maximum total duration (reference + generated) per batch (seconds).",
 
146
  )
147
  parser.add_argument(
148
  "--batch_size",
 
245
  dummy_ref_audio = (
246
  torch.randn(1, SAMPLING_RATE),
247
  SAMPLING_RATE,
248
+ ) # 1s dummy audio
249
  for i in range(warmup):
250
  worker_model.generate(
251
  text=["hello"],
 
261
  def estimate_sample_total_duration(
262
  duration_estimator: RuleDurationEstimator,
263
  text: str,
264
+ ref_text: Optional[str],
265
+ ref_audio_path: Optional[str],
266
  gen_duration: Optional[float] = None,
267
  ) -> float:
268
+ """Estimate total duration (ref + generated) for a single sample.
269
+
270
+ When ``ref_audio_path`` is ``None`` (instruct / voice-design mode),
271
+ the reference duration is treated as 0 and only the estimated generated
272
+ duration contributes to the total.
273
+ """
274
+ if ref_audio_path is not None:
275
+ ref_wav = load_audio(ref_audio_path, SAMPLING_RATE)
276
+ ref_duration = ref_wav.shape[-1] / SAMPLING_RATE
277
+ else:
278
+ ref_duration = 0
279
 
280
  if gen_duration is None:
281
+ if ref_audio_path is not None:
282
+ gen_duration = duration_estimator.estimate_duration(
283
+ text, ref_text or "", ref_duration, low_threshold=2.0
284
+ )
285
+ else:
286
+ gen_duration = duration_estimator.estimate_duration(
287
+ text, "Nice to meet you.", 0.5, low_threshold=2.0
288
+ )
289
 
290
  total_duration = ref_duration + gen_duration
291
  return total_duration
292
 
293
 
294
+ def _sort_samples_by_duration(
295
  samples: List[Tuple],
296
  duration_estimator: RuleDurationEstimator,
297
+ ) -> List[Tuple[Tuple, float]]:
298
+ """Return (sample, total_duration) pairs sorted by duration descending."""
299
  sample_with_duration = []
300
  for sample in samples:
301
+ _, ref_text, ref_audio_path, text, _, _, dur, _, _ = sample
302
  total_duration = estimate_sample_total_duration(
303
+ duration_estimator, text, ref_text, ref_audio_path, gen_duration=dur
 
 
 
 
304
  )
305
  sample_with_duration.append((sample, total_duration))
 
306
  sample_with_duration.sort(key=lambda x: x[1], reverse=True)
307
+ return sample_with_duration
308
+
309
+
310
+ def cluster_samples_by_duration(
311
+ samples: List[Tuple],
312
+ duration_estimator: RuleDurationEstimator,
313
+ batch_duration: float,
314
+ ) -> List[List[Tuple]]:
315
+ sample_with_duration = _sort_samples_by_duration(samples, duration_estimator)
316
  batches = []
317
  current_batch = []
318
  current_total_duration = 0.0
 
343
  batch_size: int,
344
  ) -> List[List[Tuple]]:
345
  """Split samples into fixed-size batches, sorted by duration to minimize padding."""
346
+ sample_with_duration = _sort_samples_by_duration(samples, duration_estimator)
 
 
 
 
 
 
 
 
 
 
 
 
347
  sorted_samples = [s for s, _ in sample_with_duration]
348
 
349
  batches = [
 
371
  langs = []
372
  durations = []
373
  speeds = []
374
+ instructs = []
375
 
376
  for sample in batch_samples:
377
+ save_name, ref_text, ref_audio_path, text, lang_id, lang_name, dur, spd, instruct = sample
378
  save_names.append(save_name)
379
  ref_texts.append(ref_text)
380
  ref_audio_paths.append(ref_audio_path)
 
382
  langs.append(lang_id)
383
  durations.append(dur)
384
  speeds.append(spd)
385
+ instructs.append(instruct)
386
 
387
  start_time = time.time()
388
  audios = worker_model.generate(
389
  text=texts,
390
  language=langs,
391
+ ref_audio=ref_audio_paths if any(p is not None for p in ref_audio_paths) else None,
392
+ ref_text=ref_texts if any(t is not None for t in ref_texts) else None,
393
  duration=durations if any(d is not None for d in durations) else None,
394
  speed=speeds if any(s is not None for s in speeds) else None,
395
+ instruct=instructs if any(i is not None for i in instructs) else None,
396
  **gen_kwargs,
397
  )
398
  batch_synth_time = time.time() - start_time
 
400
  results = []
401
  for save_name, audio in zip(save_names, audios):
402
  save_path = os.path.join(res_dir, save_name + ".wav")
403
+ sf.write(save_path, audio, worker_model.sampling_rate)
404
  audio_duration = audio.shape[-1] / worker_model.sampling_rate
405
  results.append(
406
  (
 
451
  samples.append(
452
  (
453
  s["id"],
454
+ s.get("ref_text"),
455
+ s.get("ref_audio"),
456
  s["text"],
457
  lang_id,
458
  lang_name,
459
  s.get("duration"),
460
  s.get("speed"),
461
+ s.get("instruct"),
462
  )
463
  )
464
 
 
473
  ) as executor:
474
  futures = []
475
 
 
476
  logging.info("Running batch inference")
477
 
478
+ # Split samples by mode (voice-clone vs non-voice-clone) before
479
+ # clustering so that each batch is homogeneous. Mixing ref_audio
480
+ # and non-ref_audio samples in the same batch would crash in
481
+ # generate() → create_voice_clone_prompt().
482
+ clone_samples = [s for s in samples if s[2] is not None]
483
+ other_samples = [s for s in samples if s[2] is None]
484
+
485
  duration_estimator = RuleDurationEstimator()
486
+ batches = []
487
+ for subset in (clone_samples, other_samples):
488
+ if not subset:
489
+ continue
490
+ if args.batch_size > 0:
491
+ batches.extend(
492
+ cluster_samples_by_batch_size(
493
+ subset, duration_estimator, args.batch_size
494
+ )
495
+ )
496
+ else:
497
+ batches.extend(
498
+ cluster_samples_by_duration(
499
+ subset, duration_estimator, args.batch_duration
500
+ )
501
+ )
502
 
503
  args_dict = vars(args)
504
 
omnivoice/data/dataset.py CHANGED
@@ -44,8 +44,9 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple
44
 
45
  import torch
46
  import torch.distributed as dist
47
- import torchaudio
48
  import webdataset as wds
 
 
49
  from torch.utils.data import IterableDataset
50
 
51
 
@@ -54,12 +55,8 @@ def load_audio_webdataset(data, sample_rate: int = 24000, device="cpu"):
54
  Load audio from bytes data and resample to the target sample rate if needed.
55
  Return a tensor of shape (1, num_samples)
56
  """
57
- audio, sr = torchaudio.load(io.BytesIO(data))
58
  audio = audio.to(device)
59
- if audio.size(dim=0) > 1:
60
- audio = torch.mean(audio, dim=0)
61
- if sr != sample_rate:
62
- audio = torchaudio.functional.resample(audio, sr, sample_rate)
63
  return audio
64
 
65
 
@@ -433,13 +430,9 @@ class JsonlDatasetReader(IterableDataReader):
433
  )
434
  continue
435
  try:
436
- waveform, sr = torchaudio.load(audio_path)
437
- if waveform.shape[0] > 1:
438
- waveform = waveform.mean(dim=0, keepdim=True)
439
- if sr != self.sample_rate:
440
- waveform = torchaudio.functional.resample(
441
- waveform, sr, self.sample_rate
442
- )
443
  if self.normalize_audio:
444
  waveform = (waveform / (waveform.abs().max() + 1e-7)) * 0.9
445
  meta["audio_duration"] = waveform.shape[1] / self.sample_rate
 
44
 
45
  import torch
46
  import torch.distributed as dist
 
47
  import webdataset as wds
48
+
49
+ from omnivoice.utils.audio import load_audio, load_audio_bytes
50
  from torch.utils.data import IterableDataset
51
 
52
 
 
55
  Load audio from bytes data and resample to the target sample rate if needed.
56
  Return a tensor of shape (1, num_samples)
57
  """
58
+ audio = torch.from_numpy(load_audio_bytes(data, sample_rate))
59
  audio = audio.to(device)
 
 
 
 
60
  return audio
61
 
62
 
 
430
  )
431
  continue
432
  try:
433
+ waveform = torch.from_numpy(
434
+ load_audio(audio_path, self.sample_rate)
435
+ )
 
 
 
 
436
  if self.normalize_audio:
437
  waveform = (waveform / (waveform.abs().max() + 1e-7)) * 0.9
438
  meta["audio_duration"] = waveform.shape[1] / self.sample_rate
omnivoice/eval/mos/utmos.py CHANGED
@@ -32,7 +32,7 @@ import torch
32
  from tqdm import tqdm
33
 
34
  from omnivoice.eval.models.utmos import UTMOS22Strong
35
- from omnivoice.eval.utils import load_waveform
36
  from omnivoice.utils.data_utils import read_test_list
37
 
38
  warnings.filterwarnings("ignore")
@@ -140,7 +140,7 @@ def run_utmos_worker(file_idx, wav_path, language_name):
140
  return file_idx, wav_path, language_name, f"File not found: {wav_path}", "error"
141
 
142
  # Load and preprocess waveform
143
- speech = load_waveform(wav_path, worker_sr, device=worker_device)
144
 
145
  # Compute score
146
  # UTMOS expects input shape (Batch, Time)
 
32
  from tqdm import tqdm
33
 
34
  from omnivoice.eval.models.utmos import UTMOS22Strong
35
+ from omnivoice.eval.utils import load_eval_waveform
36
  from omnivoice.utils.data_utils import read_test_list
37
 
38
  warnings.filterwarnings("ignore")
 
140
  return file_idx, wav_path, language_name, f"File not found: {wav_path}", "error"
141
 
142
  # Load and preprocess waveform
143
+ speech = load_eval_waveform(wav_path, worker_sr, device=worker_device)
144
 
145
  # Compute score
146
  # UTMOS expects input shape (Batch, Time)
omnivoice/eval/speaker_similarity/sim.py CHANGED
@@ -33,7 +33,7 @@ import torch
33
  from tqdm import tqdm
34
 
35
  from omnivoice.eval.models.ecapa_tdnn_wavlm import ECAPA_TDNN_WAVLM
36
- from omnivoice.eval.utils import load_waveform
37
  from omnivoice.utils.data_utils import read_test_list
38
 
39
  warnings.filterwarnings("ignore")
@@ -144,7 +144,7 @@ def worker_init(
144
  @torch.no_grad()
145
  def get_embedding(wav_path: str) -> torch.Tensor:
146
  """Extract embedding for a single file."""
147
- speech = load_waveform(wav_path, worker_sr, device=worker_device, max_seconds=120)
148
  return worker_model([speech])
149
 
150
 
 
33
  from tqdm import tqdm
34
 
35
  from omnivoice.eval.models.ecapa_tdnn_wavlm import ECAPA_TDNN_WAVLM
36
+ from omnivoice.eval.utils import load_eval_waveform
37
  from omnivoice.utils.data_utils import read_test_list
38
 
39
  warnings.filterwarnings("ignore")
 
144
  @torch.no_grad()
145
  def get_embedding(wav_path: str) -> torch.Tensor:
146
  """Extract embedding for a single file."""
147
+ speech = load_eval_waveform(wav_path, worker_sr, device=worker_device, max_seconds=120)
148
  return worker_model([speech])
149
 
150
 
omnivoice/eval/utils.py CHANGED
@@ -23,7 +23,7 @@ import soundfile as sf
23
  import torch
24
 
25
 
26
- def load_waveform(
27
  fname: str,
28
  sample_rate: int,
29
  dtype: str = "float32",
 
23
  import torch
24
 
25
 
26
+ def load_eval_waveform(
27
  fname: str,
28
  sample_rate: int,
29
  dtype: str = "float32",
omnivoice/eval/wer/hubert.py CHANGED
@@ -31,7 +31,7 @@ import numpy as np
31
  import torch
32
  from tqdm import tqdm
33
 
34
- from omnivoice.eval.utils import load_waveform
35
  from omnivoice.eval.wer.common import process_one
36
  from omnivoice.utils.data_utils import read_test_list
37
 
@@ -166,7 +166,7 @@ def run_eval_worker(data_chunk, batch_size):
166
  try:
167
  dataset = [
168
  {
169
- "array": load_waveform(
170
  item["wav_path"], sample_rate=16000, return_numpy=True
171
  ),
172
  "sampling_rate": 16000,
 
31
  import torch
32
  from tqdm import tqdm
33
 
34
+ from omnivoice.eval.utils import load_eval_waveform
35
  from omnivoice.eval.wer.common import process_one
36
  from omnivoice.utils.data_utils import read_test_list
37
 
 
166
  try:
167
  dataset = [
168
  {
169
+ "array": load_eval_waveform(
170
  item["wav_path"], sample_rate=16000, return_numpy=True
171
  ),
172
  "sampling_rate": 16000,
omnivoice/eval/wer/minimax.py CHANGED
@@ -34,7 +34,7 @@ import torch
34
  import zhconv
35
  from tqdm import tqdm
36
 
37
- from omnivoice.eval.utils import load_waveform
38
  from omnivoice.eval.wer.common import log_metrics, process_one
39
  from omnivoice.eval.wer.text_norm_omni import text_normalize
40
  from omnivoice.utils.data_utils import read_test_list
@@ -275,7 +275,7 @@ class SpeechEvalDataset(torch.utils.data.Dataset):
275
 
276
  def __getitem__(self, index):
277
  item = self.data_list[index]
278
- waveform = load_waveform(item["wav_path"], sample_rate=16000, return_numpy=True)
279
  return {
280
  "array": waveform,
281
  "sampling_rate": 16000,
 
34
  import zhconv
35
  from tqdm import tqdm
36
 
37
+ from omnivoice.eval.utils import load_eval_waveform
38
  from omnivoice.eval.wer.common import log_metrics, process_one
39
  from omnivoice.eval.wer.text_norm_omni import text_normalize
40
  from omnivoice.utils.data_utils import read_test_list
 
275
 
276
  def __getitem__(self, index):
277
  item = self.data_list[index]
278
+ waveform = load_eval_waveform(item["wav_path"], sample_rate=16000, return_numpy=True)
279
  return {
280
  "array": waveform,
281
  "sampling_rate": 16000,
omnivoice/eval/wer/seedtts.py CHANGED
@@ -34,7 +34,7 @@ import zhconv
34
  from tqdm import tqdm
35
  from zhon.hanzi import punctuation
36
 
37
- from omnivoice.eval.utils import load_waveform
38
  from omnivoice.eval.wer.common import process_one
39
  from omnivoice.utils.data_utils import read_test_list
40
 
@@ -228,7 +228,7 @@ def run_eval_worker(data_chunk, lang, batch_size):
228
  # Load waveforms as arrays, truncating to 30s
229
  dataset = [
230
  {
231
- "array": load_waveform(
232
  item["wav_path"], sample_rate=16000, return_numpy=True
233
  )[: 16000 * 30],
234
  "sampling_rate": 16000,
 
34
  from tqdm import tqdm
35
  from zhon.hanzi import punctuation
36
 
37
+ from omnivoice.eval.utils import load_eval_waveform
38
  from omnivoice.eval.wer.common import process_one
39
  from omnivoice.utils.data_utils import read_test_list
40
 
 
228
  # Load waveforms as arrays, truncating to 30s
229
  dataset = [
230
  {
231
+ "array": load_eval_waveform(
232
  item["wav_path"], sample_rate=16000, return_numpy=True
233
  )[: 16000 * 30],
234
  "sampling_rate": 16000,
omnivoice/models/omnivoice.py CHANGED
@@ -36,10 +36,11 @@ from dataclasses import dataclass, fields
36
  from functools import partial
37
  from typing import Any, List, Optional, Union
38
 
 
 
39
  import torch
40
  import torch.nn as nn
41
  import torch.nn.functional as F
42
- import torchaudio
43
  from torch.nn.attention.flex_attention import create_block_mask
44
  from transformers import (
45
  AutoFeatureExtractor,
@@ -310,12 +311,14 @@ class OmniVoice(PreTrainedModel):
310
  @torch.inference_mode()
311
  def transcribe(
312
  self,
313
- audio: Union[str, tuple[torch.Tensor, int]],
314
  ) -> str:
315
  """Transcribe audio using the loaded Whisper ASR model.
316
 
317
  Args:
318
- audio: File path or (waveform, sample_rate) tuple.
 
 
319
 
320
  Returns:
321
  Transcribed text.
@@ -329,12 +332,11 @@ class OmniVoice(PreTrainedModel):
329
  return self._asr_pipe(audio)["text"].strip()
330
  else:
331
  waveform, sr = audio
332
- if waveform.dim() == 1:
333
- waveform = waveform.unsqueeze(0)
334
- if waveform.size(0) > 1:
335
- waveform = torch.mean(waveform, dim=0, keepdim=True)
336
  audio_input = {
337
- "array": waveform.squeeze(0).cpu().numpy(),
338
  "sampling_rate": sr,
339
  }
340
  return self._asr_pipe(audio_input)["text"].strip()
@@ -475,7 +477,7 @@ class OmniVoice(PreTrainedModel):
475
  speed: Union[float, list[Optional[float]], None] = None,
476
  generation_config: Optional[OmniVoiceGenerationConfig] = None,
477
  **kwargs,
478
- ) -> list[torch.Tensor]:
479
  """Generate speech audio given text in various modes.
480
 
481
  Supports three modes:
@@ -522,8 +524,10 @@ class OmniVoice(PreTrainedModel):
522
  audio_chunk_threshold: Only apply chunking if estimated audio
523
  duration exceeds this threshold (seconds).
524
  Returns:
525
- ``audios`` a list of 2-D ``torch.Tensor``, with the shape (1, T) and sampling rate
526
- consistent with the model's audio tokenizer (usually 24000 Hz).
 
 
527
  """
528
 
529
  if self.audio_tokenizer is None or self.text_tokenizer is None:
@@ -611,17 +615,19 @@ class OmniVoice(PreTrainedModel):
611
  ref_wav = load_audio(ref_audio, self.sampling_rate)
612
  else:
613
  waveform, sr = ref_audio
614
- if waveform.dim() == 1:
615
- waveform = waveform.unsqueeze(0)
616
- if waveform.size(0) > 1:
617
- waveform = torch.mean(waveform, dim=0, keepdim=True)
 
 
618
  if sr != self.sampling_rate:
619
- waveform = torchaudio.functional.resample(
620
- waveform, sr, self.sampling_rate
621
  )
622
  ref_wav = waveform
623
 
624
- ref_rms = torch.sqrt(torch.mean(torch.square(ref_wav))).item()
625
  if 0 < ref_rms < 0.1:
626
  ref_wav = ref_wav * 0.1 / ref_rms
627
 
@@ -640,13 +646,13 @@ class OmniVoice(PreTrainedModel):
640
  lead_sil=100,
641
  trail_sil=200,
642
  )
643
- if ref_wav.size(-1) == 0:
644
  raise ValueError(
645
  "Reference audio is empty after silence removal. "
646
  "Try setting preprocess_prompt=False."
647
  )
648
 
649
- ref_duration = ref_wav.size(-1) / self.sampling_rate
650
  if ref_duration > 20.0:
651
  logger.warning(
652
  "Reference audio is %.1fs long (>20s). This may cause slower "
@@ -664,10 +670,14 @@ class OmniVoice(PreTrainedModel):
664
  logger.debug("Auto-transcribed ref_text: %s", ref_text)
665
 
666
  chunk_size = self.audio_tokenizer.config.hop_length
667
- clip_size = int(ref_wav.size(-1) % chunk_size)
668
  ref_wav = ref_wav[:, :-clip_size] if clip_size > 0 else ref_wav
 
 
 
 
669
  ref_audio_tokens = self.audio_tokenizer.encode(
670
- ref_wav.unsqueeze(0).to(self.audio_tokenizer.device),
671
  ).audio_codes.squeeze(
672
  0
673
  ) # (C, T)
@@ -686,7 +696,7 @@ class OmniVoice(PreTrainedModel):
686
  tokens: Union[torch.Tensor, List[torch.Tensor]],
687
  rms: Union[float, None],
688
  gen_config: OmniVoiceGenerationConfig,
689
- ) -> torch.Tensor:
690
  """
691
  Args:
692
  tokens: Audio tokens — either a single tensor of shape
@@ -694,7 +704,7 @@ class OmniVoice(PreTrainedModel):
694
  rms: RMS of the reference audio for volume adjustment.
695
  gen_config: Generation config for post-processing options.
696
  Returns:
697
- Decoded and post-processed audio tensor of shape (1, T).
698
  """
699
  tokenizer_device = self.audio_tokenizer.device
700
  if isinstance(tokens, list):
@@ -702,6 +712,7 @@ class OmniVoice(PreTrainedModel):
702
  self.audio_tokenizer.decode(t.to(tokenizer_device).unsqueeze(0))
703
  .audio_values[0]
704
  .cpu()
 
705
  for t in tokens
706
  ]
707
  audio_waveform = cross_fade_chunks(chunk_audios, self.sampling_rate)
@@ -710,28 +721,30 @@ class OmniVoice(PreTrainedModel):
710
  self.audio_tokenizer.decode(tokens.to(tokenizer_device).unsqueeze(0))
711
  .audio_values[0]
712
  .cpu()
 
713
  )
714
 
715
- return self._post_process_audio(
716
  audio_waveform,
717
  postprocess_output=gen_config.postprocess_output,
718
  ref_rms=rms,
719
  )
 
720
 
721
  def _post_process_audio(
722
  self,
723
- generated_audio: torch.Tensor,
724
  postprocess_output: bool,
725
  ref_rms: Union[float, None],
726
- ) -> torch.Tensor:
727
  """Optionally remove long silences, adjust volume, and add edge padding.
728
 
729
  Args:
730
- generated_audio: Audio tensor of shape (1, T).
731
  postprocess_output: If True, remove long silences and apply fade/pad.
732
  ref_rms: RMS of the reference audio for volume normalisation.
733
  Returns:
734
- Processed audio tensor of shape (1, T).
735
  """
736
  if postprocess_output:
737
  generated_audio = remove_silence(
@@ -745,9 +758,7 @@ class OmniVoice(PreTrainedModel):
745
  if ref_rms is not None and ref_rms < 0.1:
746
  generated_audio = generated_audio * ref_rms / 0.1
747
  elif ref_rms is None:
748
- # No reference audio (voice design): peak-normalize to 0.5
749
- # to avoid clipping while keeping a comfortable volume level.
750
- peak = generated_audio.abs().max()
751
  if peak > 1e-6:
752
  generated_audio = generated_audio / peak * 0.5
753
 
@@ -1549,6 +1560,9 @@ def _combine_text(text, ref_text: Optional[str] = None) -> str:
1549
  # filter out newline / carriage-return characters
1550
  full_text = re.sub(r"[\r\n]+", "", full_text)
1551
 
 
 
 
1552
  # collapse consecutive spaces / tabs into a single space
1553
  full_text = re.sub(r"[ \t]+", " ", full_text)
1554
 
 
36
  from functools import partial
37
  from typing import Any, List, Optional, Union
38
 
39
+ import librosa
40
+ import numpy as np
41
  import torch
42
  import torch.nn as nn
43
  import torch.nn.functional as F
 
44
  from torch.nn.attention.flex_attention import create_block_mask
45
  from transformers import (
46
  AutoFeatureExtractor,
 
311
  @torch.inference_mode()
312
  def transcribe(
313
  self,
314
+ audio: Union[str, tuple],
315
  ) -> str:
316
  """Transcribe audio using the loaded Whisper ASR model.
317
 
318
  Args:
319
+ audio: File path or ``(waveform, sample_rate)`` tuple.
320
+ Waveform can be a numpy array or torch.Tensor of shape
321
+ ``(1, T)`` or ``(T,)``.
322
 
323
  Returns:
324
  Transcribed text.
 
332
  return self._asr_pipe(audio)["text"].strip()
333
  else:
334
  waveform, sr = audio
335
+ if isinstance(waveform, torch.Tensor):
336
+ waveform = waveform.cpu().numpy()
337
+ waveform = np.squeeze(waveform) # (1, T) or (T,) → (T,)
 
338
  audio_input = {
339
+ "array": waveform,
340
  "sampling_rate": sr,
341
  }
342
  return self._asr_pipe(audio_input)["text"].strip()
 
477
  speed: Union[float, list[Optional[float]], None] = None,
478
  generation_config: Optional[OmniVoiceGenerationConfig] = None,
479
  **kwargs,
480
+ ) -> list[np.ndarray]:
481
  """Generate speech audio given text in various modes.
482
 
483
  Supports three modes:
 
524
  audio_chunk_threshold: Only apply chunking if estimated audio
525
  duration exceeds this threshold (seconds).
526
  Returns:
527
+ ``audios`` a list of 1-D ``np.ndarray`` with shape ``(T,)`` and
528
+ sampling rate consistent with the model's audio tokenizer
529
+ (usually 24 000 Hz). Can be saved directly with
530
+ ``soundfile.write("out.wav", audios[0], model.sampling_rate)``.
531
  """
532
 
533
  if self.audio_tokenizer is None or self.text_tokenizer is None:
 
615
  ref_wav = load_audio(ref_audio, self.sampling_rate)
616
  else:
617
  waveform, sr = ref_audio
618
+ if isinstance(waveform, torch.Tensor):
619
+ waveform = waveform.cpu().numpy()
620
+ if waveform.ndim == 1:
621
+ waveform = waveform[np.newaxis, :]
622
+ if waveform.shape[0] > 1:
623
+ waveform = np.mean(waveform, axis=0, keepdims=True)
624
  if sr != self.sampling_rate:
625
+ waveform = librosa.resample(
626
+ waveform, orig_sr=sr, target_sr=self.sampling_rate,
627
  )
628
  ref_wav = waveform
629
 
630
+ ref_rms = float(np.sqrt(np.mean(ref_wav ** 2)))
631
  if 0 < ref_rms < 0.1:
632
  ref_wav = ref_wav * 0.1 / ref_rms
633
 
 
646
  lead_sil=100,
647
  trail_sil=200,
648
  )
649
+ if ref_wav.shape[-1] == 0:
650
  raise ValueError(
651
  "Reference audio is empty after silence removal. "
652
  "Try setting preprocess_prompt=False."
653
  )
654
 
655
+ ref_duration = ref_wav.shape[-1] / self.sampling_rate
656
  if ref_duration > 20.0:
657
  logger.warning(
658
  "Reference audio is %.1fs long (>20s). This may cause slower "
 
670
  logger.debug("Auto-transcribed ref_text: %s", ref_text)
671
 
672
  chunk_size = self.audio_tokenizer.config.hop_length
673
+ clip_size = int(ref_wav.shape[-1] % chunk_size)
674
  ref_wav = ref_wav[:, :-clip_size] if clip_size > 0 else ref_wav
675
+ # numpy → torch at tokenizer boundary
676
+ ref_wav_tensor = torch.from_numpy(ref_wav).to(
677
+ self.audio_tokenizer.device
678
+ )
679
  ref_audio_tokens = self.audio_tokenizer.encode(
680
+ ref_wav_tensor.unsqueeze(0),
681
  ).audio_codes.squeeze(
682
  0
683
  ) # (C, T)
 
696
  tokens: Union[torch.Tensor, List[torch.Tensor]],
697
  rms: Union[float, None],
698
  gen_config: OmniVoiceGenerationConfig,
699
+ ) -> np.ndarray:
700
  """
701
  Args:
702
  tokens: Audio tokens — either a single tensor of shape
 
704
  rms: RMS of the reference audio for volume adjustment.
705
  gen_config: Generation config for post-processing options.
706
  Returns:
707
+ Decoded and post-processed audio array of shape (T,).
708
  """
709
  tokenizer_device = self.audio_tokenizer.device
710
  if isinstance(tokens, list):
 
712
  self.audio_tokenizer.decode(t.to(tokenizer_device).unsqueeze(0))
713
  .audio_values[0]
714
  .cpu()
715
+ .numpy()
716
  for t in tokens
717
  ]
718
  audio_waveform = cross_fade_chunks(chunk_audios, self.sampling_rate)
 
721
  self.audio_tokenizer.decode(tokens.to(tokenizer_device).unsqueeze(0))
722
  .audio_values[0]
723
  .cpu()
724
+ .numpy()
725
  )
726
 
727
+ audio_waveform = self._post_process_audio(
728
  audio_waveform,
729
  postprocess_output=gen_config.postprocess_output,
730
  ref_rms=rms,
731
  )
732
+ return audio_waveform.squeeze(0)
733
 
734
  def _post_process_audio(
735
  self,
736
+ generated_audio: np.ndarray,
737
  postprocess_output: bool,
738
  ref_rms: Union[float, None],
739
+ ) -> np.ndarray:
740
  """Optionally remove long silences, adjust volume, and add edge padding.
741
 
742
  Args:
743
+ generated_audio: Numpy array of shape (1, T).
744
  postprocess_output: If True, remove long silences and apply fade/pad.
745
  ref_rms: RMS of the reference audio for volume normalisation.
746
  Returns:
747
+ Processed numpy array of shape (1, T).
748
  """
749
  if postprocess_output:
750
  generated_audio = remove_silence(
 
758
  if ref_rms is not None and ref_rms < 0.1:
759
  generated_audio = generated_audio * ref_rms / 0.1
760
  elif ref_rms is None:
761
+ peak = np.abs(generated_audio).max()
 
 
762
  if peak > 1e-6:
763
  generated_audio = generated_audio / peak * 0.5
764
 
 
1560
  # filter out newline / carriage-return characters
1561
  full_text = re.sub(r"[\r\n]+", "", full_text)
1562
 
1563
+ # replace Chinese parentheses with English ones
1564
+ full_text = full_text.replace("\uff08", "(").replace("\uff09", ")")
1565
+
1566
  # collapse consecutive spaces / tabs into a single space
1567
  full_text = re.sub(r"[ \t]+", " ", full_text)
1568
 
omnivoice/scripts/denoise_audio.py CHANGED
@@ -73,6 +73,7 @@ from tqdm.auto import tqdm
73
 
74
  from omnivoice.data.batching import StreamLengthGroupDataset
75
  from omnivoice.data.dataset import JsonlDatasetReader, WebDatasetReader
 
76
  from omnivoice.utils.common import str2bool
77
 
78
  SIDON_INPUT_SAMPLE_RATE = 16_000
@@ -367,10 +368,10 @@ def extract_seamless_m4t_features(
367
 
368
  def serialise_flac(key: str, waveform: torch.Tensor, sample_rate: int) -> dict:
369
  buffer = io.BytesIO()
370
- audio = waveform.to(dtype=torch.float32).cpu()
371
- if audio.ndim == 1:
372
- audio = audio.unsqueeze(0)
373
- torchaudio.save(buffer, audio, sample_rate, format="flac", bits_per_sample=16)
374
  return {"__key__": key, "flac": buffer.getvalue()}
375
 
376
 
 
73
 
74
  from omnivoice.data.batching import StreamLengthGroupDataset
75
  from omnivoice.data.dataset import JsonlDatasetReader, WebDatasetReader
76
+ import soundfile as sf
77
  from omnivoice.utils.common import str2bool
78
 
79
  SIDON_INPUT_SAMPLE_RATE = 16_000
 
368
 
369
def serialise_flac(key: str, waveform: torch.Tensor, sample_rate: int) -> dict:
    """Encode a waveform to 16-bit FLAC bytes for a WebDataset record.

    Args:
        key: Sample key for the WebDataset record.
        waveform: Audio tensor of shape (T,) or (C, T); values are
            presumably floats in [-1, 1] — soundfile quantises them to PCM.
        sample_rate: Sampling rate of the waveform.

    Returns:
        Dict with ``__key__`` and the FLAC-encoded ``flac`` payload.
    """
    buffer = io.BytesIO()
    audio = waveform.to(dtype=torch.float32).cpu().numpy()
    if audio.ndim == 2:
        audio = audio.T  # (C, T) → (T, C) for soundfile
    # subtype="PCM_16" pins the 16-bit depth this pipeline previously
    # requested via torchaudio's bits_per_sample=16 (FLAC's default subtype
    # is PCM_16, but stating it guards against library-default changes).
    sf.write(buffer, audio, sample_rate, format="FLAC", subtype="PCM_16")
    return {"__key__": key, "flac": buffer.getvalue()}
376
 
377
 
omnivoice/scripts/extract_audio_tokens_add_noise.py CHANGED
@@ -66,13 +66,13 @@ from typing import Any
66
  import numpy as np
67
  import torch
68
  import torch.nn.functional as F
69
- import torchaudio
70
  import webdataset as wds
71
  from torch.utils.data import DataLoader, IterableDataset
72
  from tqdm.auto import tqdm
73
  from transformers import AutoFeatureExtractor, HiggsAudioV2TokenizerModel
74
 
75
  from omnivoice.data.dataset import JsonlDatasetReader, WebDatasetReader
 
76
  from omnivoice.utils.common import str2bool
77
 
78
  warnings.filterwarnings(
@@ -207,13 +207,7 @@ def serialise_numpy(key: str, tokens: np.ndarray) -> dict:
207
 
208
  def _load_aug_audio(data, sample_rate=24000):
209
  """Simple audio loader for augmentation files."""
210
- with io.BytesIO(data) as b:
211
- wav, sr = torchaudio.load(b)
212
- if wav.shape[0] > 1:
213
- wav = wav.mean(dim=0, keepdim=True)
214
- if sr != sample_rate:
215
- wav = torchaudio.functional.resample(wav, sr, sample_rate)
216
- return wav
217
 
218
 
219
  class SimpleWorkerSampler:
 
66
  import numpy as np
67
  import torch
68
  import torch.nn.functional as F
 
69
  import webdataset as wds
70
  from torch.utils.data import DataLoader, IterableDataset
71
  from tqdm.auto import tqdm
72
  from transformers import AutoFeatureExtractor, HiggsAudioV2TokenizerModel
73
 
74
  from omnivoice.data.dataset import JsonlDatasetReader, WebDatasetReader
75
+ from omnivoice.utils.audio import load_audio_bytes
76
  from omnivoice.utils.common import str2bool
77
 
78
  warnings.filterwarnings(
 
207
 
208
def _load_aug_audio(data, sample_rate=24000):
    """Simple audio loader for augmentation files."""
    # load_audio_bytes returns a mono, resampled float32 array of shape (1, T).
    waveform = load_audio_bytes(data, sample_rate)
    return torch.from_numpy(waveform)
 
 
 
 
 
 
211
 
212
 
213
  class SimpleWorkerSampler:
omnivoice/scripts/jsonl_to_webdataset.py CHANGED
@@ -65,10 +65,13 @@ from concurrent.futures import (
65
  from itertools import islice
66
  from pathlib import Path
67
 
68
- import torchaudio
69
  import webdataset as wds
70
  from tqdm import tqdm
71
 
 
 
 
72
  from omnivoice.utils.common import str2bool
73
 
74
 
@@ -164,16 +167,16 @@ def process_audio_item(meta, target_sr):
164
  if not os.path.exists(audio_path):
165
  raise FileNotFoundError(f"{audio_path} not found")
166
 
167
- waveform, sr = torchaudio.load(audio_path)
168
  audio_duration = waveform.shape[1] / sr
169
  meta["audio_duration"] = audio_duration
170
 
171
  if target_sr and sr != target_sr:
172
- waveform = torchaudio.functional.resample(waveform, sr, target_sr)
173
  sr = target_sr
174
 
175
  audio_buffer = io.BytesIO()
176
- torchaudio.save(audio_buffer, waveform, sr, format="flac", bits_per_sample=16)
177
  audio_bytes = audio_buffer.getvalue()
178
 
179
  sample = {
 
65
  from itertools import islice
66
  from pathlib import Path
67
 
68
+ import librosa
69
  import webdataset as wds
70
  from tqdm import tqdm
71
 
72
+ import soundfile as sf
73
+
74
+ from omnivoice.utils.audio import load_waveform
75
  from omnivoice.utils.common import str2bool
76
 
77
 
 
167
  if not os.path.exists(audio_path):
168
  raise FileNotFoundError(f"{audio_path} not found")
169
 
170
+ waveform, sr = load_waveform(audio_path)
171
  audio_duration = waveform.shape[1] / sr
172
  meta["audio_duration"] = audio_duration
173
 
174
  if target_sr and sr != target_sr:
175
+ waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)
176
  sr = target_sr
177
 
178
  audio_buffer = io.BytesIO()
179
+ sf.write(audio_buffer, waveform.T, sr, format="FLAC")
180
  audio_bytes = audio_buffer.getvalue()
181
 
182
  sample = {
omnivoice/utils/audio.py CHANGED
@@ -17,83 +17,157 @@
17
 
18
  """Audio I/O and processing utilities.
19
 
20
- Provides functions for loading, resampling, silence removal, chunking,
21
- cross-fading, and format conversion. Used by ``OmniVoice.generate()`` during
22
- inference post-processing.
 
 
23
  """
24
 
 
 
 
 
25
  import numpy as np
26
- import torch
27
- import torchaudio
28
  from pydub import AudioSegment
29
  from pydub.silence import detect_leading_silence, detect_nonsilent, split_on_silence
30
 
 
 
 
 
 
 
31
 
32
- def load_audio(audio_path: str, sampling_rate: int):
 
 
 
 
 
 
 
 
 
 
33
  """
34
- Load the waveform with torchaudio and resampling if needed.
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  Parameters:
37
  audio_path: path of the audio.
38
  sampling_rate: target sampling rate.
39
 
40
  Returns:
41
- Loaded prompt waveform with target sampling rate,
42
- PyTorch tensor of shape (1, T)
43
  """
44
- try:
45
- waveform, prompt_sampling_rate = torchaudio.load(
46
- audio_path, backend="soundfile"
 
 
 
 
 
 
47
  )
48
- except (RuntimeError, OSError):
49
- # Fallback via pydub+ffmpeg for formats torchaudio can't handle
50
- aseg = AudioSegment.from_file(audio_path)
51
- audio_data = np.array(aseg.get_array_of_samples()).astype(np.float32) / 32768.0
52
- if aseg.channels == 1:
53
- waveform = torch.from_numpy(audio_data).unsqueeze(0)
54
- else:
55
- waveform = torch.from_numpy(audio_data.reshape(-1, aseg.channels).T)
56
- prompt_sampling_rate = aseg.frame_rate
57
-
58
- if prompt_sampling_rate != sampling_rate:
59
- waveform = torchaudio.functional.resample(
60
- waveform,
61
- orig_freq=prompt_sampling_rate,
62
- new_freq=sampling_rate,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  )
64
- if waveform.shape[0] > 1:
65
- waveform = torch.mean(waveform, dim=0, keepdim=True)
66
 
67
- return waveform
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
 
70
  def remove_silence(
71
- audio: torch.Tensor,
72
  sampling_rate: int,
73
  mid_sil: int = 300,
74
  lead_sil: int = 100,
75
  trail_sil: int = 300,
76
- ):
77
- """
78
- Remove middle silences longer than mid_sil ms, and edge silences longer than edge_sil ms
79
 
80
  Parameters:
81
- audio: PyTorch tensor with shape (C, T).
82
  sampling_rate: sampling rate of the audio.
83
- mid_sil: the duration of silences in the middle of audio to be removed in ms.
84
- if mid_sil <= 0, no middle silence will be removed.
85
- edge_sil: the duration of silences in the edge of audio to be removed in ms.
86
- trail_sil: the duration of added trailing silence in ms.
87
 
88
  Returns:
89
- PyTorch tensor with shape (C, T), where C is number of channels
90
- and T is number of audio samples
91
  """
92
- # Load audio file
93
- wave = tensor_to_audiosegment(audio, sampling_rate)
94
 
95
  if mid_sil > 0:
96
- # Split audio using silences longer than mid_sil
97
  non_silent_segs = split_on_silence(
98
  wave,
99
  min_silence_len=mid_sil,
@@ -101,17 +175,13 @@ def remove_silence(
101
  keep_silence=mid_sil,
102
  seek_step=10,
103
  )
104
-
105
- # Concatenate all non-silent segments
106
  wave = AudioSegment.silent(duration=0)
107
  for seg in non_silent_segs:
108
  wave += seg
109
 
110
- # Remove silence longer than 0.1 seconds in the begining and ending of wave
111
  wave = remove_silence_edges(wave, lead_sil, trail_sil, -50)
112
 
113
- # Convert to PyTorch tensor
114
- return audiosegment_to_tensor(wave)
115
 
116
 
117
  def remove_silence_edges(
@@ -119,25 +189,12 @@ def remove_silence_edges(
119
  lead_sil: int = 100,
120
  trail_sil: int = 300,
121
  silence_threshold: float = -50,
122
- ):
123
- """
124
- Remove edge silences longer than `keep_silence` ms.
125
-
126
- Parameters:
127
- audio: an AudioSegment object.
128
- keep_silence: kept silence in the edge.
129
- only_edge: If true, only remove edge silences.
130
- silence_threshold: the threshold of silence.
131
-
132
- Returns:
133
- An AudioSegment object
134
- """
135
- # Remove heading silence
136
  start_idx = detect_leading_silence(audio, silence_threshold=silence_threshold)
137
  start_idx = max(0, start_idx - lead_sil)
138
  audio = audio[start_idx:]
139
 
140
- # Remove trailing silence
141
  audio = audio.reverse()
142
  start_idx = detect_leading_silence(audio, silence_threshold=silence_threshold)
143
  start_idx = max(0, start_idx - trail_sil)
@@ -147,80 +204,22 @@ def remove_silence_edges(
147
  return audio
148
 
149
 
150
- def audiosegment_to_tensor(aseg):
151
- """
152
- Convert a pydub.AudioSegment to PyTorch audio tensor
153
- """
154
- audio_data = np.array(aseg.get_array_of_samples())
155
-
156
- # Convert to float32 and normalize to [-1, 1] range
157
- audio_data = audio_data.astype(np.float32) / 32768.0
158
-
159
- # Handle channels
160
- if aseg.channels == 1:
161
- # Mono channel: add channel dimension (T) -> (1, T)
162
- tensor_data = torch.from_numpy(audio_data).unsqueeze(0)
163
- else:
164
- # Multi-channel: reshape to (C, T)
165
- tensor_data = torch.from_numpy(audio_data.reshape(-1, aseg.channels).T)
166
-
167
- return tensor_data
168
-
169
-
170
- def tensor_to_audiosegment(tensor, sample_rate):
171
- """
172
- Convert a PyTorch audio tensor to pydub.AudioSegment
173
-
174
- Parameters:
175
- tensor: Tensor with shape (C, T), where C is the number of channels
176
- and T is the time steps
177
- sample_rate: Audio sample rate
178
- """
179
- # Convert tensor to numpy array
180
- assert isinstance(tensor, torch.Tensor)
181
- audio_np = tensor.cpu().numpy()
182
-
183
- # Convert to int16 type (common format for pydub)
184
- # Assumes tensor values are in [-1, 1] range as floating point
185
- audio_np = (audio_np * 32768.0).clip(-32768, 32767).astype(np.int16)
186
-
187
- # Convert to byte stream
188
- # For multi-channel audio, pydub requires interleaved format
189
- # (e.g., left-right-left-right)
190
- if audio_np.shape[0] > 1:
191
- # Convert to interleaved format
192
- audio_np = audio_np.transpose(1, 0).flatten()
193
- audio_bytes = audio_np.tobytes()
194
-
195
- # Create AudioSegment
196
- audio_segment = AudioSegment(
197
- data=audio_bytes,
198
- sample_width=2,
199
- frame_rate=sample_rate,
200
- channels=tensor.shape[0],
201
- )
202
-
203
- return audio_segment
204
-
205
-
206
  def fade_and_pad_audio(
207
- audio: torch.Tensor,
208
  pad_duration: float = 0.1,
209
  fade_duration: float = 0.1,
210
  sample_rate: int = 24000,
211
- ) -> torch.Tensor:
212
- """
213
- Applies a smooth fade-in and fade-out to the audio, and then pads both sides
214
- with pure silence to prevent abrupt starts and ends (clicks/pops).
215
 
216
  Args:
217
- audio: PyTorch tensor of shape (C, T) containing audio data.
218
- pad_duration: Duration of pure silence to add to each end (in seconds).
219
- fade_duration: Duration of the fade-in/out curve (in seconds).
220
- sample_rate: Audio sampling rate.
221
 
222
  Returns:
223
- Processed sequence tensor with shape (C, T_new)
224
  """
225
  if audio.shape[-1] == 0:
226
  return audio
@@ -228,59 +227,53 @@ def fade_and_pad_audio(
228
  fade_samples = int(fade_duration * sample_rate)
229
  pad_samples = int(pad_duration * sample_rate)
230
 
231
- processed = audio.clone()
232
 
233
  if fade_samples > 0:
234
  k = min(fade_samples, processed.shape[-1] // 2)
235
-
236
  if k > 0:
237
- fade_in = torch.linspace(
238
- 0, 1, k, device=processed.device, dtype=processed.dtype
239
- )[None, :]
240
- processed[..., :k] = processed[..., :k] * fade_in
241
 
242
- fade_out = torch.linspace(
243
- 1, 0, k, device=processed.device, dtype=processed.dtype
244
- )[None, :]
245
- processed[..., -k:] = processed[..., -k:] * fade_out
246
 
247
  if pad_samples > 0:
248
- silence = torch.zeros(
249
  (processed.shape[0], pad_samples),
250
  dtype=processed.dtype,
251
- device=processed.device,
252
  )
253
- processed = torch.cat([silence, processed, silence], dim=-1)
254
 
255
  return processed
256
 
257
 
258
  def trim_long_audio(
259
- audio: torch.Tensor,
260
  sampling_rate: int,
261
  max_duration: float = 15.0,
262
  min_duration: float = 3.0,
263
  trim_threshold: float = 20.0,
264
- ) -> torch.Tensor:
265
- """Trim audio to <= max_duration by splitting at the largest silence gap.
266
 
267
  Only trims when the audio exceeds *trim_threshold* seconds.
268
 
269
  Args:
270
- audio: Audio tensor of shape (C, T).
271
- sampling_rate: Audio sampling rate.
272
- max_duration: Maximum duration in seconds.
273
- min_duration: Minimum duration in seconds.
274
- trim_threshold: Only trim if audio is longer than this (seconds).
275
 
276
  Returns:
277
- Trimmed audio tensor.
278
  """
279
- duration = audio.size(-1) / sampling_rate
280
  if duration <= trim_threshold:
281
  return audio
282
 
283
- seg = tensor_to_audiosegment(audio, sampling_rate)
284
  nonsilent = detect_nonsilent(
285
  seg, min_silence_len=100, silence_thresh=-40, seek_step=10
286
  )
@@ -290,7 +283,6 @@ def trim_long_audio(
290
  max_ms = int(max_duration * 1000)
291
  min_ms = int(min_duration * 1000)
292
 
293
- # Walk through speech regions; at each gap pick the latest split <= max_duration
294
  best_split = 0
295
  for start, end in nonsilent:
296
  if start > best_split and start <= max_ms:
@@ -302,56 +294,49 @@ def trim_long_audio(
302
  best_split = min(max_ms, len(seg))
303
 
304
  trimmed = seg[:best_split]
305
- return audiosegment_to_tensor(trimmed)
306
 
307
 
308
  def cross_fade_chunks(
309
- chunks: list[torch.Tensor],
310
  sample_rate: int,
311
  silence_duration: float = 0.3,
312
- ) -> torch.Tensor:
313
- """Concatenate audio chunks with a short silence gap and fade at boundaries.
314
-
315
- Each boundary is structured as: fade-out tail → silence buffer → fade-in head.
316
- This avoids click artifacts from direct concatenation or overlapping mismatch.
317
 
318
  Args:
319
- chunks: List of audio tensors, each (C, T).
320
- sample_rate: Audio sample rate.
321
- silence_duration: Total silence gap duration in seconds.
322
 
323
  Returns:
324
- Merged audio tensor (C, T_total).
325
  """
326
  if len(chunks) == 1:
327
  return chunks[0]
328
 
329
  total_n = int(silence_duration * sample_rate)
330
  fade_n = total_n // 3
331
- silence_n = fade_n # middle silent gap
332
- merged = chunks[0].clone()
333
 
334
  for chunk in chunks[1:]:
335
- dev, dt = merged.device, merged.dtype
336
  parts = [merged]
337
 
338
- # Fade out tail of current merged audio
339
- fout_n = min(fade_n, merged.size(-1))
340
  if fout_n > 0:
341
- w_out = torch.linspace(1, 0, fout_n, device=dev, dtype=dt)[None, :]
342
- parts[-1][..., -fout_n:] = parts[-1][..., -fout_n:] * w_out
343
 
344
- # Silent buffer between chunks
345
- parts.append(torch.zeros(chunks[0].shape[0], silence_n, device=dev, dtype=dt))
346
 
347
- # Fade in head of next chunk
348
- fade_in = chunk.clone()
349
- fin_n = min(fade_n, fade_in.size(-1))
350
  if fin_n > 0:
351
- w_in = torch.linspace(0, 1, fin_n, device=dev, dtype=dt)[None, :]
352
- fade_in[..., :fin_n] = fade_in[..., :fin_n] * w_in
353
 
354
  parts.append(fade_in)
355
- merged = torch.cat(parts, dim=-1)
356
 
357
  return merged
 
17
 
18
  """Audio I/O and processing utilities.
19
 
20
+ Provides functions for loading, resampling, silence removal,
21
+ chunking, cross-fading, and format conversion.
22
+
23
+ All public functions in this module operate on **numpy float32 arrays**
24
+ with shape ``(C, T)`` (channels-first).
25
  """
26
 
27
+ import io
28
+ import logging
29
+
30
+ import librosa
31
  import numpy as np
32
+ import soundfile as sf
 
33
  from pydub import AudioSegment
34
  from pydub.silence import detect_leading_silence, detect_nonsilent, split_on_silence
35
 
36
+ logger = logging.getLogger(__name__)
37
+
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Loading
41
+ # ---------------------------------------------------------------------------
42
 
43
+
44
def load_waveform(audio_path: str):
    """Load audio from a file path, returning ``(data, sample_rate)``.

    Tries two backends in order:
      1. soundfile — covers WAV/FLAC/OGG etc., no ffmpeg needed.
      2. librosa  — covers MP3/M4A etc. via audioread + ffmpeg.

    Args:
        audio_path: Path to the audio file.

    Returns:
        ``(data, sample_rate)`` where ``data`` is a numpy float32 array of
        shape ``(C, T)``.
    """
    try:
        data, sr = sf.read(audio_path, dtype="float32", always_2d=True)
        return data.T, sr  # soundfile yields (T, C); transpose to (C, T)
    except (RuntimeError, OSError) as err:
        # soundfile cannot decode MP3/M4A etc. — it raises LibsndfileError,
        # a RuntimeError subclass. Catching only (RuntimeError, OSError)
        # instead of bare Exception keeps genuine bugs (TypeError,
        # MemoryError, ...) from being silently masked by the fallback.
        logger.debug(
            "soundfile could not read %s (%s); falling back to librosa",
            audio_path,
            err,
        )

    data, sr = librosa.load(audio_path, sr=None, mono=False)
    if data.ndim == 1:
        # librosa returns (T,) for mono input; normalise to (C, T).
        data = data[np.newaxis, :]
    return data, sr
64
+
65
+
66
def load_audio(audio_path: str, sampling_rate: int) -> np.ndarray:
    """Load a waveform from file, downmix to mono, and resample.

    Parameters:
        audio_path: path of the audio.
        sampling_rate: target sampling rate.

    Returns:
        Numpy float32 array of shape (1, T).
    """
    waveform, native_sr = load_waveform(audio_path)

    # Downmix multi-channel audio to a single channel.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(axis=0, keepdims=True)

    # Resample only when the native rate differs from the target.
    if native_sr != sampling_rate:
        waveform = librosa.resample(
            waveform,
            orig_sr=native_sr,
            target_sr=sampling_rate,
        )

    return waveform
88
+
89
+
90
def load_audio_bytes(raw: bytes, sampling_rate: int) -> np.ndarray:
    """Decode in-memory audio bytes, downmix to mono, and resample.

    Parameters:
        raw: raw audio file bytes (e.g. from WebDataset).
        sampling_rate: target sampling rate.

    Returns:
        Numpy float32 array of shape (1, T).
    """
    stream = io.BytesIO(raw)

    try:
        decoded, native_sr = sf.read(stream, dtype="float32", always_2d=True)
        decoded = decoded.T  # soundfile yields (T, C); transpose to (C, T)
    except Exception:
        # soundfile cannot decode e.g. MP3/M4A; rewind and retry via librosa.
        stream.seek(0)
        decoded, native_sr = librosa.load(stream, sr=None, mono=False)
        if decoded.ndim == 1:
            # librosa returns (T,) for mono input; normalise to (C, T).
            decoded = decoded[np.newaxis, :]

    # Downmix multi-channel audio to a single channel.
    if decoded.shape[0] > 1:
        decoded = decoded.mean(axis=0, keepdims=True)

    # Resample only when the native rate differs from the target.
    if native_sr != sampling_rate:
        decoded = librosa.resample(
            decoded,
            orig_sr=native_sr,
            target_sr=sampling_rate,
        )

    return decoded
121
+
122
+
123
+ # ---------------------------------------------------------------------------
124
+ # Audio processing (all numpy in / numpy out)
125
+ # ---------------------------------------------------------------------------
126
+
127
+
128
def numpy_to_audiosegment(audio: np.ndarray, sample_rate: int) -> AudioSegment:
    """Convert a numpy float32 array of shape (C, T) to a pydub AudioSegment.

    Values are assumed to lie in [-1, 1] and are quantised to 16-bit PCM.
    """
    n_channels = audio.shape[0]
    pcm = np.clip(audio * 32768.0, -32768, 32767).astype(np.int16)
    if n_channels > 1:
        # pydub expects interleaved samples (L R L R ...).
        pcm = pcm.T.flatten()
    return AudioSegment(
        data=pcm.tobytes(),
        sample_width=2,  # 16-bit PCM
        frame_rate=sample_rate,
        channels=n_channels,
    )
139
+
140
+
141
def audiosegment_to_numpy(aseg: AudioSegment) -> np.ndarray:
    """Convert a pydub AudioSegment to a numpy float32 array of shape (C, T)."""
    # NOTE(review): the / 32768.0 scaling assumes 16-bit samples — confirm
    # callers never pass 8/24-bit segments.
    samples = np.asarray(aseg.get_array_of_samples(), dtype=np.float32) / 32768.0
    if aseg.channels > 1:
        # Interleaved (T*C,) → (T, C) → (C, T).
        return samples.reshape(-1, aseg.channels).T
    return samples[np.newaxis, :]
147
 
148
 
149
  def remove_silence(
150
+ audio: np.ndarray,
151
  sampling_rate: int,
152
  mid_sil: int = 300,
153
  lead_sil: int = 100,
154
  trail_sil: int = 300,
155
+ ) -> np.ndarray:
156
+ """Remove middle silences longer than *mid_sil* ms and trim edge silences.
 
157
 
158
  Parameters:
159
+ audio: numpy array with shape (C, T).
160
  sampling_rate: sampling rate of the audio.
161
+ mid_sil: middle-silence threshold in ms (0 to skip).
162
+ lead_sil: kept leading silence in ms.
163
+ trail_sil: kept trailing silence in ms.
 
164
 
165
  Returns:
166
+ Numpy array with shape (C, T').
 
167
  """
168
+ wave = numpy_to_audiosegment(audio, sampling_rate)
 
169
 
170
  if mid_sil > 0:
 
171
  non_silent_segs = split_on_silence(
172
  wave,
173
  min_silence_len=mid_sil,
 
175
  keep_silence=mid_sil,
176
  seek_step=10,
177
  )
 
 
178
  wave = AudioSegment.silent(duration=0)
179
  for seg in non_silent_segs:
180
  wave += seg
181
 
 
182
  wave = remove_silence_edges(wave, lead_sil, trail_sil, -50)
183
 
184
+ return audiosegment_to_numpy(wave)
 
185
 
186
 
187
  def remove_silence_edges(
 
189
  lead_sil: int = 100,
190
  trail_sil: int = 300,
191
  silence_threshold: float = -50,
192
+ ) -> AudioSegment:
193
+ """Remove edge silences, keeping *lead_sil* / *trail_sil* ms."""
 
 
 
 
 
 
 
 
 
 
 
 
194
  start_idx = detect_leading_silence(audio, silence_threshold=silence_threshold)
195
  start_idx = max(0, start_idx - lead_sil)
196
  audio = audio[start_idx:]
197
 
 
198
  audio = audio.reverse()
199
  start_idx = detect_leading_silence(audio, silence_threshold=silence_threshold)
200
  start_idx = max(0, start_idx - trail_sil)
 
204
  return audio
205
 
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
def fade_and_pad_audio(
    audio: np.ndarray,
    pad_duration: float = 0.1,
    fade_duration: float = 0.1,
    sample_rate: int = 24000,
) -> np.ndarray:
    """Apply a linear fade-in/out and surround the audio with silence.

    Prevents click/pop artifacts from abrupt starts and ends.

    Args:
        audio: numpy array of shape (C, T).
        pad_duration: silence padding duration per side (seconds).
        fade_duration: fade curve duration (seconds).
        sample_rate: audio sampling rate.

    Returns:
        Processed numpy array of shape (C, T_new).
    """
    if audio.shape[-1] == 0:
        return audio

    n_fade = int(fade_duration * sample_rate)
    n_pad = int(pad_duration * sample_rate)
    out = audio.copy()  # never mutate the caller's buffer

    if n_fade > 0:
        # Cap the ramp at half the signal so fade-in/out never overlap.
        ramp_len = min(n_fade, out.shape[-1] // 2)
        if ramp_len > 0:
            ramp = np.linspace(0.0, 1.0, ramp_len, dtype=np.float32)
            out[..., :ramp_len] *= ramp[np.newaxis, :]
            out[..., -ramp_len:] *= ramp[::-1][np.newaxis, :]

    if n_pad > 0:
        pad = np.zeros((out.shape[0], n_pad), dtype=out.dtype)
        out = np.concatenate((pad, out, pad), axis=-1)

    return out
249
 
250
 
251
  def trim_long_audio(
252
+ audio: np.ndarray,
253
  sampling_rate: int,
254
  max_duration: float = 15.0,
255
  min_duration: float = 3.0,
256
  trim_threshold: float = 20.0,
257
+ ) -> np.ndarray:
258
+ """Trim audio to <= *max_duration* by splitting at the largest silence gap.
259
 
260
  Only trims when the audio exceeds *trim_threshold* seconds.
261
 
262
  Args:
263
+ audio: numpy array of shape (C, T).
264
+ sampling_rate: audio sampling rate.
265
+ max_duration: maximum duration in seconds.
266
+ min_duration: minimum duration in seconds.
267
+ trim_threshold: only trim if audio is longer than this (seconds).
268
 
269
  Returns:
270
+ Trimmed numpy array.
271
  """
272
+ duration = audio.shape[-1] / sampling_rate
273
  if duration <= trim_threshold:
274
  return audio
275
 
276
+ seg = numpy_to_audiosegment(audio, sampling_rate)
277
  nonsilent = detect_nonsilent(
278
  seg, min_silence_len=100, silence_thresh=-40, seek_step=10
279
  )
 
283
  max_ms = int(max_duration * 1000)
284
  min_ms = int(min_duration * 1000)
285
 
 
286
  best_split = 0
287
  for start, end in nonsilent:
288
  if start > best_split and start <= max_ms:
 
294
  best_split = min(max_ms, len(seg))
295
 
296
  trimmed = seg[:best_split]
297
+ return audiosegment_to_numpy(trimmed)
298
 
299
 
300
def cross_fade_chunks(
    chunks: list[np.ndarray],
    sample_rate: int,
    silence_duration: float = 0.3,
) -> np.ndarray:
    """Concatenate audio chunks with silence gaps and fades at boundaries.

    Each boundary is: fade-out tail → silent gap → fade-in head, which
    avoids click artifacts from direct concatenation.

    Args:
        chunks: list of numpy arrays, each (C, T).
        sample_rate: audio sample rate.
        silence_duration: total silence gap duration in seconds.

    Returns:
        Merged numpy array (C, T_total).
    """
    if len(chunks) == 1:
        return chunks[0]

    gap_total = int(silence_duration * sample_rate)
    fade_len = gap_total // 3  # one third fade-out, one third gap, one third fade-in
    n_channels = chunks[0].shape[0]

    merged = chunks[0].copy()
    for nxt in chunks[1:]:
        # Fade out the tail of what has been merged so far.
        tail_len = min(fade_len, merged.shape[-1])
        if tail_len > 0:
            ramp_down = np.linspace(1.0, 0.0, tail_len, dtype=np.float32)
            merged[..., -tail_len:] *= ramp_down[np.newaxis, :]

        # Fade in the head of the next chunk (on a copy — inputs untouched).
        head = nxt.copy()
        head_len = min(fade_len, head.shape[-1])
        if head_len > 0:
            ramp_up = np.linspace(0.0, 1.0, head_len, dtype=np.float32)
            head[..., :head_len] *= ramp_up[np.newaxis, :]

        gap = np.zeros((n_channels, fade_len), dtype=np.float32)
        merged = np.concatenate((merged, gap, head), axis=-1)

    return merged
omnivoice/utils/data_utils.py CHANGED
@@ -29,10 +29,10 @@ from pathlib import Path
29
  def read_test_list(path):
30
  """Read a JSONL test list file.
31
 
32
- Each line should be a JSON object with fields:
33
- id, text, ref_audio, ref_text, language_id, language_name, duration, speed
34
-
35
- language_id, language_name, duration, and speed are optional (default to None).
36
 
37
  Returns a list of dicts.
38
  """
@@ -58,6 +58,7 @@ def read_test_list(path):
58
  "language_name": obj.get("language_name"),
59
  "duration": obj.get("duration"),
60
  "speed": obj.get("speed"),
 
61
  }
62
  samples.append(sample)
63
  return samples
 
29
  def read_test_list(path):
30
  """Read a JSONL test list file.
31
 
32
+ Each line should be a JSON object. Only ``id`` and ``text`` are required;
33
+ all other fields are optional (default to ``None``):
34
+ id, text, ref_audio, ref_text, instruct,
35
+ language_id, language_name, duration, speed
36
 
37
  Returns a list of dicts.
38
  """
 
58
  "language_name": obj.get("language_name"),
59
  "duration": obj.get("duration"),
60
  "speed": obj.get("speed"),
61
+ "instruct": obj.get("instruct"),
62
  }
63
  samples.append(sample)
64
  return samples
requirements.txt CHANGED
@@ -5,6 +5,7 @@ transformers==5.3
5
  accelerate
6
  pydub
7
  soundfile
 
8
  numpy
9
  gradio
10
  hf_transfer
 
5
  accelerate
6
  pydub
7
  soundfile
8
+ librosa
9
  numpy
10
  gradio
11
  hf_transfer