mirror of
https://github.com/ace-step/ACE-Step-1.5.git
synced 2026-07-02 16:37:04 +00:00
fix(flow-edit): skip LM Phase 1 for text2music + morph (#1156)
The earlier fix only zeroed ``audio_code_string_to_use`` at the top of ``generate_music``. But when Think is on (the UI default), LM Phase 1 still runs and overwrites ``audio_code_string_to_use`` with freshly generated codes; those codes are then routed into ``conditioning_target``, so ``zt_edit`` starts out-of-distribution (OOD) again. Add ``text2music + flow_edit_morph`` to the LM-skip path alongside cover / cover-nofsq / repaint / extract. Think and CoT both silently no-op in the morph case, so the downstream pipeline VAE-encodes the src_audio cleanly regardless of whether Think was checked in the UI. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4028a006fd
commit
80f8c9c6b5
1 changed file with 19 additions and 5 deletions
|
|
@ -458,7 +458,19 @@ def generate_music(
|
|||
# For extract tasks, LLM-generated captions can conflict with the extract instruction
|
||||
# and cause the DiT model to reconstruct input audio instead of extracting stems.
|
||||
skip_lm_tasks = {"cover", "cover-nofsq", "repaint", "extract"}
|
||||
|
||||
# Flow-edit overlay on text2music must NOT trigger LM Phase 1.
|
||||
# Even if Think is on, the LM-generated codes would be routed
|
||||
# into ``conditioning_target`` which replaces target_wavs with
|
||||
# zeros and uses ``_decode_audio_codes_to_latents(codes)`` for
|
||||
# target_latents — flow-edit's ``zt_edit = src_latents.clone()``
|
||||
# then starts at a codes-decoded latent (different distribution
|
||||
# than VAE encode) and the V_delta integration collapses to a
|
||||
# near-silent latent. Treat morph-on-text2music like a skip
|
||||
# task so Think / CoT both no-op.
|
||||
morph_on_text2music = (
|
||||
params.task_type == "text2music" and params.flow_edit_morph
|
||||
)
|
||||
|
||||
# Determine if we should use LLM
|
||||
# LLM is needed for:
|
||||
# 1. thinking=True: generate audio codes via LM
|
||||
|
|
@ -466,11 +478,13 @@ def generate_music(
|
|||
# 3. use_cot_language=True: detect vocal language via CoT
|
||||
# 4. use_cot_metas=True: fill missing metadata via CoT
|
||||
need_lm_for_cot = params.use_cot_caption or params.use_cot_language or params.use_cot_metas
|
||||
use_lm = (params.thinking or need_lm_for_cot) and llm_handler is not None and llm_handler.llm_initialized and params.task_type not in skip_lm_tasks
|
||||
skip_lm = params.task_type in skip_lm_tasks or morph_on_text2music
|
||||
use_lm = (params.thinking or need_lm_for_cot) and llm_handler is not None and llm_handler.llm_initialized and not skip_lm
|
||||
lm_status = []
|
||||
|
||||
if params.task_type in skip_lm_tasks:
|
||||
logger.info(f"Skipping LM for task_type='{params.task_type}' - using DiT directly")
|
||||
|
||||
if skip_lm:
|
||||
reason = params.task_type if params.task_type in skip_lm_tasks else f"{params.task_type}+flow_edit_morph"
|
||||
logger.info(f"Skipping LM for task_type='{reason}' - using DiT directly")
|
||||
|
||||
logger.info(f"[generate_music] LLM usage decision: thinking={params.thinking}, "
|
||||
f"use_cot_caption={params.use_cot_caption}, use_cot_language={params.use_cot_language}, "
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue