// video_translate/src/server/videoSubtitleGeneration.test.ts

import fs from 'fs';
import { afterEach, describe, expect, it, vi } from 'vitest';
import { generateSubtitlesFromVideo } from './videoSubtitleGeneration';

/**
 * Tests for `generateSubtitlesFromVideo` using the `doubao` provider.
 *
 * Every test injects a stubbed `fetchImpl`, so no network request is made.
 * Shared fixtures live in the helpers below so the three tests cannot drift
 * apart.
 */
describe('generateSubtitlesFromVideo', () => {
  /** Endpoint configured as `baseUrl` and expected as the first `fetch` argument. */
  const ARK_RESPONSES_URL = 'https://ark.cn-beijing.volces.com/api/v3/responses';

  /**
   * Builds a fresh provider config per call so that a test can never observe
   * mutations made to the config during another test.
   */
  const createDoubaoConfig = () => ({
    provider: 'doubao',
    apiKey: 'ark-key',
    model: 'doubao-seed-2-0-pro-260215',
    baseUrl: ARK_RESPONSES_URL,
    timeoutMs: 600000,
  });

  /**
   * Builds the subtitle result the stubbed model "returns".
   *
   * @param extraSubtitleFields Additional fields appended to the single subtitle entry.
   * @returns An object with `sourceLanguage` and a one-element `subtitles` array.
   */
  const createModelResult = (extraSubtitleFields: Record<string, unknown> = {}) => ({
    sourceLanguage: 'zh',
    subtitles: [
      {
        originalText: 'hello there',
        translatedText: 'Hello',
        ttsText: 'Bonjour',
        ttsLanguage: 'fr',
        startTime: 0,
        endTime: 1,
        ...extraSubtitleFields,
      },
    ],
  });

  /**
   * Creates a `fetch` stub that answers every call with a 200 JSON response
   * wrapping `text` in the `{ output: [{ content: [{ text }] }] }` envelope
   * used throughout these tests.
   *
   * @param text Raw text placed in the single content item.
   * @returns A vitest mock typed as `fetch`; a new `Response` is built per call.
   */
  const stubFetchReturningText = (text: string) =>
    vi.fn<typeof fetch>(async () =>
      new Response(JSON.stringify({ output: [{ content: [{ text }] }] }), {
        status: 200,
        headers: { 'Content-Type': 'application/json' },
      }),
    );

  /**
   * Invokes the function under test with the shared doubao config and the
   * language settings common to all tests.
   *
   * @param fetchImpl The stubbed `fetch` to inject.
   * @param source Either a local `videoPath` or an already uploaded `fileId`.
   * @returns Whatever `generateSubtitlesFromVideo` resolves to.
   */
  const runWithDoubao = (
    fetchImpl: typeof fetch,
    source: { videoPath: string } | { fileId: string },
  ) =>
    // NOTE(review): the `as any` cast bypasses type-checking of the arguments;
    // the declared parameter type is not visible from this file, so it is kept.
    generateSubtitlesFromVideo({
      providerConfig: createDoubaoConfig(),
      ...source,
      targetLanguage: 'English',
      ttsLanguage: 'fr',
      fetchImpl,
    } as any);

  afterEach(() => {
    // Undo the `fs.readFileSync` spy (and any other spies) between tests.
    vi.restoreAllMocks();
  });

  it('passes the configured doubao timeout to fetch', async () => {
    // Return fixed bytes for file reads so no real video file is needed.
    vi.spyOn(fs, 'readFileSync').mockReturnValue(Buffer.from('video-bytes'));
    const fetchImpl = stubFetchReturningText(JSON.stringify(createModelResult()));

    await runWithDoubao(fetchImpl, { videoPath: 'clip.mp4' });

    // NOTE(review): this only verifies that an AbortSignal is attached to the
    // request; it does not check that the signal is derived from `timeoutMs`.
    expect(fetchImpl).toHaveBeenCalledWith(
      ARK_RESPONSES_URL,
      expect.objectContaining({
        method: 'POST',
        signal: expect.any(AbortSignal),
      }),
    );
  });

  it('uses ark file ids for doubao requests when available', async () => {
    const fetchImpl = stubFetchReturningText(JSON.stringify(createModelResult()));

    await runWithDoubao(fetchImpl, { fileId: 'file-123' });

    // Fail with a clear assertion instead of a destructuring TypeError when
    // the request was never sent.
    expect(fetchImpl).toHaveBeenCalled();
    const [, request] = fetchImpl.mock.calls[0] as [string, RequestInit];
    const payload = JSON.parse(String(request.body));

    const [systemMessage, userMessage] = payload.input;

    // System message: a single text part carrying the prompt.
    expect(systemMessage.role).toBe('system');
    expect(systemMessage.content[0].type).toBe('input_text');

    const systemPrompt = systemMessage.content[0].text;
    const expectedPromptFragments = [
      '# Role',
      'Voice Selection',
      '# Output Contract',
      'The first character of your response must be {.',
      'The last character of your response must be }.',
      '"ttsText":',
      '"ttsLanguage":',
      'translatedText must always be English',
      'originalText must be a faithful transcription of the actually audible speech',
      'Do not rewrite, summarize, polish, correct, or paraphrase',
      'Do not infer hidden dialogue from context, visuals, plot, or likely meaning',
    ];
    for (const fragment of expectedPromptFragments) {
      expect(systemPrompt).toContain(fragment);
    }

    // User message: the video referenced by file id, followed by a text part.
    expect(userMessage.role).toBe('user');
    expect(userMessage.content[0]).toEqual({
      type: 'input_video',
      file_id: 'file-123',
    });
    expect(userMessage.content[1].type).toBe('input_text');

    const userPrompt = userMessage.content[1].text;
    expect(userPrompt).toContain('Subtitle language: English');
    expect(userPrompt).toContain('TTS language: fr');
    expect(userPrompt).toContain('Available voices for the TTS language');
  });

  it('extracts tts fields when doubao returns prose around a fenced payload', async () => {
    // The model answer wraps the JSON in a Markdown code fence with prose on
    // both sides; the JSON itself is compact (no whitespace).
    const fencedAnswer = [
      'Here is the JSON result:',
      '```json',
      JSON.stringify(createModelResult({ speaker: 'Speaker 1', voiceId: 'male-qn-qingse' })),
      '```',
      'Use it directly.',
    ].join('\n');
    const fetchImpl = stubFetchReturningText(fencedAnswer);

    const result = await runWithDoubao(fetchImpl, { fileId: 'file-123' });

    expect(result.sourceLanguage).toBe('zh');
    expect(result.subtitles).toHaveLength(1);
    expect(result.subtitles[0]).toMatchObject({
      originalText: 'hello there',
      translatedText: 'Hello',
      ttsText: 'Bonjour',
      ttsLanguage: 'fr',
      speaker: 'Speaker 1',
      voiceId: 'male-qn-qingse',
    });
  });
});