From 7cbc6c697c20d396491ff8902429bccf71579ffd Mon Sep 17 00:00:00 2001 From: Song367 <601337784@qq.com> Date: Thu, 19 Mar 2026 22:56:53 +0800 Subject: [PATCH] =?UTF-8?q?=E9=85=8D=E7=BD=AE=E6=8F=90=E7=A4=BA=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/server/videoSubtitleGeneration.test.ts | 3 +++ src/server/videoSubtitleGeneration.ts | 27 ++++++++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/src/server/videoSubtitleGeneration.test.ts b/src/server/videoSubtitleGeneration.test.ts index 7dc2b87..e2e339b 100644 --- a/src/server/videoSubtitleGeneration.test.ts +++ b/src/server/videoSubtitleGeneration.test.ts @@ -124,6 +124,9 @@ describe('generateSubtitlesFromVideo', () => { expect(payload.input[0].content[0].text).toContain('"ttsText":'); expect(payload.input[0].content[0].text).toContain('"ttsLanguage":'); expect(payload.input[0].content[0].text).toContain('translatedText must always be English'); + expect(payload.input[0].content[0].text).toContain('originalText must be a faithful transcription of the actually audible speech'); + expect(payload.input[0].content[0].text).toContain('Do not rewrite, summarize, polish, correct, or paraphrase'); + expect(payload.input[0].content[0].text).toContain('Do not infer hidden dialogue from context, visuals, plot, or likely meaning'); expect(payload.input[1].role).toBe('user'); expect(payload.input[1].content[0]).toEqual({ diff --git a/src/server/videoSubtitleGeneration.ts b/src/server/videoSubtitleGeneration.ts index 38ef7fc..7fa134b 100644 --- a/src/server/videoSubtitleGeneration.ts +++ b/src/server/videoSubtitleGeneration.ts @@ -185,20 +185,35 @@ The duration of a single subtitle item should usually not exceed 3 to 5 seconds. Accurately identify the speaker label and speaker gender. Gender must be either "male" or "female". -5. Translation Rules: +5. Transcription Fidelity Rules: +- originalText must be a faithful transcription of the actually audible speech. +- Do not rewrite, summarize, polish, correct, or paraphrase the spoken content in originalText. +- Do not add words that are not clearly supported by the audio. +- Do not infer hidden dialogue from context, visuals, plot, or likely meaning. +- If a word or short phrase is unclear, keep the transcription conservative and only include what is reasonably audible. +- translatedText and ttsText must be derived from originalText, not invented independently. +- Never let translation quality override transcription fidelity. + +6. Translation Rules: - translatedText must always be English subtitle text for on-screen display. - ttsText must be translated into the user-provided TTS language. - translatedText and ttsText must preserve the same meaning as the original speech. - translatedText should prioritize subtitle readability. - ttsText should prioritize natural spoken dubbing in the target TTS language. -6. Voice Selection: +7. Voice Selection: The user will provide a TTS language and a list of available voices. Each voice includes a voiceId and descriptive metadata. You must analyze the user-provided voice list and choose the best matching voiceId for each subtitle item. Only return a voiceId that exists in the user-provided voice list. Do not invent new voiceId values. +Priority order: +1. Audio-faithful transcription accuracy +2. Timestamp accuracy +3. Translation quality +4. Voice matching + # Output Contract You must return exactly one JSON object. The first character of your response must be {. @@ -239,9 +254,11 @@ Return a JSON object with this exact top-level structure: 12. Use video timeline seconds for startTime and endTime. 13. Keep subtitles chronological and non-overlapping. 14. Do not invent dialogue if it is not actually audible. -15. Preserve meaning naturally while keeping subtitle lines short and readable. -16. If a long utterance must be split, preserve continuity across consecutive subtitle items. -17. Output JSON only.`; +15. originalText must be a faithful transcription of what is actually spoken in the source audio. +16. Do not rewrite, polish, summarize, or infer originalText from context. +17. Preserve meaning naturally while keeping subtitle lines short and readable. +18. If a long utterance must be split, preserve continuity across consecutive subtitle items. +19. Output JSON only.`; const createUserPrompt = (ttsLanguage: string) => `Subtitle language: English TTS language: ${ttsLanguage}