From 7cbc6c697c20d396491ff8902429bccf71579ffd Mon Sep 17 00:00:00 2001
From: Song367 <601337784@qq.com>
Date: Thu, 19 Mar 2026 22:56:53 +0800
Subject: [PATCH] =?UTF-8?q?=E9=85=8D=E7=BD=AE=E6=8F=90=E7=A4=BA=E8=AF=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/server/videoSubtitleGeneration.test.ts |  3 +++
 src/server/videoSubtitleGeneration.ts      | 27 ++++++++++++++++++----
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/src/server/videoSubtitleGeneration.test.ts b/src/server/videoSubtitleGeneration.test.ts
index 7dc2b87..e2e339b 100644
--- a/src/server/videoSubtitleGeneration.test.ts
+++ b/src/server/videoSubtitleGeneration.test.ts
@@ -124,6 +124,9 @@ describe('generateSubtitlesFromVideo', () => {
     expect(payload.input[0].content[0].text).toContain('"ttsText":');
     expect(payload.input[0].content[0].text).toContain('"ttsLanguage":');
     expect(payload.input[0].content[0].text).toContain('translatedText must always be English');
+    expect(payload.input[0].content[0].text).toContain('originalText must be a faithful transcription of the actually audible speech');
+    expect(payload.input[0].content[0].text).toContain('Do not rewrite, summarize, polish, correct, or paraphrase');
+    expect(payload.input[0].content[0].text).toContain('Do not infer hidden dialogue from context, visuals, plot, or likely meaning');
 
     expect(payload.input[1].role).toBe('user');
     expect(payload.input[1].content[0]).toEqual({
diff --git a/src/server/videoSubtitleGeneration.ts b/src/server/videoSubtitleGeneration.ts
index 38ef7fc..7fa134b 100644
--- a/src/server/videoSubtitleGeneration.ts
+++ b/src/server/videoSubtitleGeneration.ts
@@ -185,20 +185,35 @@ The duration of a single subtitle item should usually not exceed 3 to 5 seconds.
 Accurately identify the speaker label and speaker gender.
 Gender must be either "male" or "female".
 
-5. Translation Rules:
+5. Transcription Fidelity Rules:
+- originalText must be a faithful transcription of the actually audible speech.
+- Do not rewrite, summarize, polish, correct, or paraphrase the spoken content in originalText.
+- Do not add words that are not clearly supported by the audio.
+- Do not infer hidden dialogue from context, visuals, plot, or likely meaning.
+- If a word or short phrase is unclear, keep the transcription conservative and only include what is reasonably audible.
+- translatedText and ttsText must be derived from originalText, not invented independently.
+- Never let translation quality override transcription fidelity.
+
+6. Translation Rules:
 - translatedText must always be English subtitle text for on-screen display.
 - ttsText must be translated into the user-provided TTS language.
 - translatedText and ttsText must preserve the same meaning as the original speech.
 - translatedText should prioritize subtitle readability.
 - ttsText should prioritize natural spoken dubbing in the target TTS language.
 
-6. Voice Selection:
+7. Voice Selection:
 The user will provide a TTS language and a list of available voices.
 Each voice includes a voiceId and descriptive metadata.
 You must analyze the user-provided voice list and choose the best matching voiceId for each subtitle item.
 Only return a voiceId that exists in the user-provided voice list.
 Do not invent new voiceId values.
 
+Priority order:
+1. Audio-faithful transcription accuracy
+2. Timestamp accuracy
+3. Translation quality
+4. Voice matching
+
 # Output Contract
 You must return exactly one JSON object.
 The first character of your response must be {.
@@ -239,9 +254,11 @@ Return a JSON object with this exact top-level structure:
 12. Use video timeline seconds for startTime and endTime.
 13. Keep subtitles chronological and non-overlapping.
 14. Do not invent dialogue if it is not actually audible.
-15. Preserve meaning naturally while keeping subtitle lines short and readable.
-16. If a long utterance must be split, preserve continuity across consecutive subtitle items.
-17. Output JSON only.`;
+15. originalText must be a faithful transcription of what is actually spoken in the source audio.
+16. Do not rewrite, polish, summarize, or infer originalText from context.
+17. Preserve meaning naturally while keeping subtitle lines short and readable.
+18. If a long utterance must be split, preserve continuity across consecutive subtitle items.
+19. Output JSON only.`;
 
 const createUserPrompt = (ttsLanguage: string) => `Subtitle language: English
 TTS language: ${ttsLanguage}