fix(flow-edit): skip LM Phase 1 for text2music + morph (#1156)

The earlier fix only zeroed ``audio_code_string_to_use`` at the top of ``generate_music``. But when Think is on (the UI default), LM Phase 1 runs anyway and overwrites ``audio_code_string_to_use`` with freshly generated codes — those codes are then routed into ``conditioning_target`` as before, and zt_edit starts out-of-distribution (OOD) again. Add ``text2music + flow_edit_morph`` to the LM-skip path alongside cover / cover-nofsq / repaint / extract. Think and CoT both silently no-op in the morph case, so the downstream pipeline VAE-encodes the src_audio cleanly regardless of whether Think was checked in the UI.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-02 16:37:04 +00:00 · 2026-04-30 18:45:40 +08:00 · 2026-04-30 18:45:40 +08:00 · 80f8c9c6b5
commit 80f8c9c6b5
parent 4028a006fd
1 changed files with 19 additions and 5 deletions
--- a/acestep/inference.py
+++ b/acestep/inference.py
@@ -458,7 +458,19 @@ def generate_music(
        # For extract tasks, LLM-generated captions can conflict with the extract instruction
        # and cause the DiT model to reconstruct input audio instead of extracting stems.
        skip_lm_tasks = {"cover", "cover-nofsq", "repaint", "extract"}
-        
+        # Flow-edit overlay on text2music must NOT trigger LM Phase 1.
+        # Even if Think is on, the LM-generated codes would be routed
+        # into ``conditioning_target`` which replaces target_wavs with
+        # zeros and uses ``_decode_audio_codes_to_latents(codes)`` for
+        # target_latents — flow-edit's ``zt_edit = src_latents.clone()``
+        # then starts at a codes-decoded latent (different distribution
+        # than VAE encode) and the V_delta integration collapses to a
+        # near-silent latent.  Treat morph-on-text2music like a skip
+        # task so Think / CoT both no-op.
+        morph_on_text2music = (
+            params.task_type == "text2music" and params.flow_edit_morph
+        )
+
        # Determine if we should use LLM
        # LLM is needed for:
        # 1. thinking=True: generate audio codes via LM
@@ -466,11 +478,13 @@ def generate_music(
        # 3. use_cot_language=True: detect vocal language via CoT
        # 4. use_cot_metas=True: fill missing metadata via CoT
        need_lm_for_cot = params.use_cot_caption or params.use_cot_language or params.use_cot_metas
-        use_lm = (params.thinking or need_lm_for_cot) and llm_handler is not None and llm_handler.llm_initialized and params.task_type not in skip_lm_tasks
+        skip_lm = params.task_type in skip_lm_tasks or morph_on_text2music
+        use_lm = (params.thinking or need_lm_for_cot) and llm_handler is not None and llm_handler.llm_initialized and not skip_lm
        lm_status = []
-        
-        if params.task_type in skip_lm_tasks:
-            logger.info(f"Skipping LM for task_type='{params.task_type}' - using DiT directly")
+
+        if skip_lm:
+            reason = params.task_type if params.task_type in skip_lm_tasks else f"{params.task_type}+flow_edit_morph"
+            logger.info(f"Skipping LM for task_type='{reason}' - using DiT directly")
        
        logger.info(f"[generate_music] LLM usage decision: thinking={params.thinking}, "
                   f"use_cot_caption={params.use_cot_caption}, use_cot_language={params.use_cot_language}, "