File upload
All checks were successful
Gitea Actions Demo / Explore-Gitea-Actions (push) Successful in 27s

This commit is contained in:
Song367 2026-03-19 11:17:10 +08:00
parent 9ddcdc9ec6
commit a0c1dc6ad5
24 changed files with 1316 additions and 61 deletions

.env
View File

@ -2,4 +2,6 @@ GEMINI_API_KEY="AIzaSyAex0MkGj_X-h3L38334xVdZsFzOcU9cC0"
ARK_API_KEY="e96194a9-8eda-4a90-a211-6db288045bdc"
MINIMAX_API_KEY="eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJHcm91cE5hbWUiOiLkuIrmtbfpopzpgJTnp5HmioDmnInpmZDlhazlj7giLCJVc2VyTmFtZSI6IuadqOmqpSIsIkFjY291bnQiOiIiLCJTdWJqZWN0SUQiOiIxNzI4NzEyMzI0OTc5NjI2ODM5IiwiUGhvbmUiOiIxMzM4MTU1OTYxOCIsIkdyb3VwSUQiOiIxNzI4NzEyMzI0OTcxMjM4MjMxIiwiUGFnZU5hbWUiOiIiLCJNYWlsIjoiIiwiQ3JlYXRlVGltZSI6IjIwMjUtMDYtMDYgMTU6MDU6NTUiLCJUb2tlblR5cGUiOjEsImlzcyI6Im1pbmltYXgifQ.aw1AUJnBYxXerJ4qNUaXM3DqPTd94WSVHWRiIpnjImhuCia3Ta1AyANTQTx__2CF5eByHOaHJFHhBCg6KgHUEaR6TiWFn0fWwXaU7XgnHwbvD4pNAmF_uYxMKbi-a6IyIGNyFdEMy22V5JEqfY4okAco5U96cnSOQZH7lyIBpvOsesjZU6L9q6Tf2jvlcnO9QG8GPg2DVpeL8Q3zLuYWezN4Wk6N-ISwQmZUwBYL3BhYamsFqCdSEyMd_uYQ_aQJa5tmlQqpimtALiutFshPUXB6VsvXEO6q-lCZ6Tg8QWwlFHkmEtUMQw4pWoX25d7Us06VFUhvV6pOzvM7yqCaWw"
VITE_BASE_URL=/video_translate/
VITE_API_BASE_PATH=/video_translate/api
DOUBAO_TIMEOUT_MS=900000
VITE_ARK_API_KEY="e96194a9-8eda-4a90-a211-6db288045bdc"

View File

@ -4,6 +4,10 @@ GEMINI_API_KEY="MY_GEMINI_API_KEY"
# ARK_API_KEY: Required when the editor LLM is set to Doubao.
ARK_API_KEY="YOUR_ARK_API_KEY"
# VITE_ARK_API_KEY: Required only if the browser uploads videos directly to Ark Files API.
# This exposes the key to the frontend and should only be used in trusted environments.
# VITE_ARK_API_KEY="YOUR_ARK_API_KEY"
# DEFAULT_LLM_PROVIDER: Optional editor default. Supported values: doubao, gemini.
# Defaults to doubao.
DEFAULT_LLM_PROVIDER="doubao"
@ -12,11 +16,19 @@ DEFAULT_LLM_PROVIDER="doubao"
# Defaults to doubao-seed-2-0-pro-260215.
DOUBAO_MODEL="doubao-seed-2-0-pro-260215"
# DOUBAO_TIMEOUT_MS: Optional timeout for Doubao subtitle requests in milliseconds.
# Defaults to 600000 (10 minutes).
# DOUBAO_TIMEOUT_MS="600000"
# VITE_API_BASE_PATH: Optional frontend API base path.
# Defaults to /api.
# Set to /video_translate/api when the app is served under /video_translate.
# VITE_API_BASE_PATH="/video_translate/api"
# VITE_ALLOWED_HOSTS: Optional comma-separated hostnames allowed by the Vite dev server.
# Useful when exposing the dev server through a tunnel such as cpolar.
# VITE_ALLOWED_HOSTS="ced4302.r20.vip.cpolar.cn"
# MINIMAX_API_KEY: Required for MiniMax TTS API calls.
# Use a MiniMax API secret key that has TTS access enabled.
MINIMAX_API_KEY="YOUR_MINIMAX_API_KEY"

View File

@ -0,0 +1,62 @@
# Ubuntu Start Script Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
**Goal:** Add Ubuntu development scripts that can start the app in the background and stop it later without requiring the caller to `cd` first.
**Architecture:** Keep Bash entrypoints in the repository root. `start-dev.sh` resolves the project directory, creates a `run/` working area, launches `npm run dev` in a dedicated process group, and records the group leader PID and log path. `stop.sh` reads the recorded PID, stops the whole process group, and removes stale state.
**Tech Stack:** Bash, npm
---
### Task 1: Add Ubuntu start and stop scripts
**Files:**
- Modify: `E:\Downloads\ai-video-dubbing-&-translation\start-dev.sh`
- Create: `E:\Downloads\ai-video-dubbing-&-translation\stop.sh`
- Create: `E:\Downloads\ai-video-dubbing-&-translation\docs\plans\2026-03-18-ubuntu-start-script.md`
**Step 1: Define the verification target**
Run: `bash -n ./start-dev.sh`
Expected: exit code 0 after the script is updated
Run: `bash -n ./stop.sh`
Expected: exit code 0 after the script is added
**Step 2: Write the minimal implementation**
Update `start-dev.sh` so it:
- uses `#!/usr/bin/env bash`
- enables `set -euo pipefail`
- resolves the script directory
- changes into that directory
- creates `run/`
- starts `npm run dev` in the background as its own process group
- writes the process id to `run/dev.pid`
- writes logs to `run/dev.log`
- refuses to start a second copy if the PID is still alive
Create `stop.sh` so it:
- resolves the script directory
- reads `run/dev.pid`
- sends `TERM` to the whole process group if it is running
- waits briefly and escalates to `KILL` only if needed
- removes stale `run/dev.pid`
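The behavior specified above could be sketched roughly as follows. This is a hypothetical sketch, not the final implementation: it assumes `setsid` is available (as it is on Ubuntu), and uses the `run/dev.pid` and `run/dev.log` names from this plan.

```bash
#!/usr/bin/env bash
# Hypothetical sketch of the start/stop logic described in this plan.
set -euo pipefail

# Resolve the project directory so callers never need to cd first.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"

start_dev() {
  cd "$SCRIPT_DIR"
  mkdir -p run
  if [[ -f run/dev.pid ]] && kill -0 "$(cat run/dev.pid)" 2>/dev/null; then
    echo "dev server already running (pid $(cat run/dev.pid))" >&2
    return 1
  fi
  # setsid makes npm the leader of a new process group,
  # so stop_dev can later signal the whole tree at once.
  setsid npm run dev >run/dev.log 2>&1 &
  echo "$!" > run/dev.pid
  echo "started pid $(cat run/dev.pid), logs: run/dev.log"
}

stop_dev() {
  cd "$SCRIPT_DIR"
  [[ -f run/dev.pid ]] || { echo "nothing to stop"; return 0; }
  local pid
  pid="$(cat run/dev.pid)"
  if kill -0 "$pid" 2>/dev/null; then
    # A negative pid targets the whole process group.
    kill -TERM -- "-$pid" || true
    sleep 2
    kill -0 "$pid" 2>/dev/null && kill -KILL -- "-$pid" || true
  fi
  rm -f run/dev.pid
}
```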
**Step 3: Run syntax verification**
Run: `bash -n ./start-dev.sh`
Expected: exit code 0 with no syntax errors
Run: `bash -n ./stop.sh`
Expected: exit code 0 with no syntax errors
**Step 4: Run an execution smoke check**
Run: `bash ./start-dev.sh`
Expected: npm starts the development server in the background and prints the PID/log location
Run: `bash ./stop.sh`
Expected: the background dev process stops and the PID file is removed

View File

@ -0,0 +1,156 @@
# Doubao Frontend File ID Upload Design
**Goal:** Let the browser upload videos to Volcengine Ark Files API, then send the returned `file_id` to this app's backend so Doubao subtitle generation can use `Responses API` with `file_id` instead of inline base64 video payloads.
## Context
The current subtitle flow uploads the full video to this app's backend, then the backend reads the file and sends a `data:video/mp4;base64,...` payload to Doubao. That works for smaller files, but it inherits request body size limits and repeats the full video upload on every subtitle generation request.
The user wants a staged flow:
1. Frontend uploads the selected video directly to Ark Files API.
2. Frontend receives a `file_id`.
3. Frontend calls this app's `/api/generate-subtitles` endpoint with that `file_id`.
4. Backend keeps ownership of the Doubao `Responses API` request, logging, normalization, and subtitle result shaping.
## Approaches Considered
### Option A: Frontend uploads to Files API, backend uses `file_id` for Doubao
This keeps the current app architecture mostly intact. Only the upload stage moves to the browser. The backend still handles provider selection, subtitle parsing, error mapping, and normalized response shaping.
**Pros**
- Smallest architectural change
- Keeps existing backend logging and response normalization
- Preserves the existing `/api/generate-subtitles` contract with a backward-compatible extension
- Allows a gradual rollout because base64 upload can remain as fallback
**Cons**
- Frontend gains Ark-specific upload logic
- The browser now coordinates two network calls for Doubao
### Option B: Frontend uploads to Files API and also calls Doubao `Responses API`
This removes backend involvement for Doubao subtitle generation, but it pushes subtitle parsing and normalization into the browser.
**Pros**
- Shorter network path for Doubao
**Cons**
- Large frontend refactor
- Duplicates provider logic across frontend and backend
- Loses centralized logging and error handling
- Makes Gemini and Doubao flows diverge more sharply
### Recommendation
Use **Option A**. It solves the request-size problem without discarding the backend subtitle pipeline that already exists.
## Architecture
### Frontend
Add a small Ark upload helper that:
1. Accepts the selected `File`
2. Sends `FormData` to `https://ark.cn-beijing.volces.com/api/v3/files`
3. Includes:
- `purpose=user_data`
- `file=@<video>`
- `preprocess_configs[video][fps]=1`
4. Reads the response JSON and returns the Ark `file_id`
`generateSubtitlePipeline(...)` will gain an optional `options` object. When the provider is `doubao`, it will:
1. Upload the file to Ark
2. Call this app's `/api/generate-subtitles` with `fileId`, `provider`, `targetLanguage`, and optional `trimRange`
For `gemini`, it will keep the current multipart upload path unchanged.
### Backend
The `/api/generate-subtitles` endpoint will support two request shapes:
1. Existing multipart upload with `video`
2. New JSON or urlencoded body with `fileId`
The subtitle request parser will be extended to accept optional `fileId`.
The video subtitle generation pipeline will accept either:
1. `videoPath`
2. `fileId`
For Doubao:
- If `fileId` is present, send:
- `type: "input_video"`
- `file_id: "<ark-file-id>"`
- `fps: 1`
- If `fileId` is absent, preserve the current base64 fallback path
For Gemini:
- Continue requiring a local uploaded file path
- Return a clear error if Gemini is requested without `video`
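The Doubao branching above amounts to choosing one of two `input_video` content shapes. A sketch, with the caveat that the `video_url` field name for the base64 fallback is an assumption for illustration:

```typescript
// Hypothetical sketch: pick the Doubao video content block for the Responses API.
type DoubaoVideoContent =
  | { type: 'input_video'; file_id: string; fps: number }
  | { type: 'input_video'; video_url: string; fps: number };

export function buildDoubaoVideoContent(options: {
  fileId?: string;
  base64DataUrl?: string; // e.g. "data:video/mp4;base64,..."
}): DoubaoVideoContent {
  if (options.fileId) {
    // New path: reference the file already uploaded to Ark Files API.
    return { type: 'input_video', file_id: options.fileId, fps: 1 };
  }
  if (options.base64DataUrl) {
    // Fallback path: inline base64 payload, as in the current implementation.
    return { type: 'input_video', video_url: options.base64DataUrl, fps: 1 };
  }
  throw new Error('A video upload or fileId is required.');
}
```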
## Data Flow
### Doubao Path
1. User selects video in the browser
2. `EditorScreen` triggers subtitle generation
3. Frontend uploads the `File` to Ark Files API
4. Frontend receives `file_id`
5. Frontend posts `fileId` to `/api/generate-subtitles`
6. Backend resolves Doubao provider config
7. Backend calls Ark `Responses API` with `file_id`
8. Backend parses and normalizes subtitle JSON
9. Frontend renders normalized subtitles
### Gemini Path
1. User selects video in the browser
2. Frontend posts multipart form data with `video`
3. Backend sends inline video bytes to Gemini as today
## Error Handling
### Frontend Upload Errors
If Ark Files API fails, the frontend should surface a direct upload error and avoid calling this app's backend. The user should see the upstream message when possible.
### Backend Request Validation
The backend should reject requests when:
- Neither `video` nor `fileId` is provided
- `targetLanguage` is missing
- `gemini` is requested with `fileId` only
### Provider-Specific Behavior
- `doubao + fileId` uses the new Ark file reference path
- `doubao + video` remains supported as fallback
- `gemini + video` remains unchanged
- `gemini + fileId` returns a clear validation error
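The validation and provider rules above reduce to a small decision table. A sketch with assumed request shapes (not the actual `parseSubtitleRequest` implementation):

```typescript
// Hypothetical validation implementing the rules listed above.
interface SubtitleRequestSketch {
  provider: 'doubao' | 'gemini';
  targetLanguage?: string;
  hasVideoUpload: boolean;
  fileId?: string;
}

// Returns an error message, or null when the request is acceptable.
export function validateSubtitleRequest(req: SubtitleRequestSketch): string | null {
  if (!req.hasVideoUpload && !req.fileId) return 'No video file provided';
  if (!req.targetLanguage) return 'A target language is required';
  if (req.provider === 'gemini' && !req.hasVideoUpload) {
    return 'Gemini requires a direct video upload; fileId is not supported';
  }
  return null;
}
```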
## Testing Strategy
### Frontend
- Unit test Ark file upload helper request shape
- Unit test `generateSubtitlePipeline` uses `fileId` for Doubao and skips multipart video upload to this app's backend
- Unit test `generateSubtitlePipeline` keeps multipart upload for Gemini
- UI test `EditorScreen` still passes the selected provider through subtitle generation
### Backend
- Unit test subtitle request parsing with `fileId`
- Unit test Doubao video generation uses `file_id` when present
- Unit test base64 fallback remains intact
- Unit test Gemini path rejects `fileId`-only requests
## Rollout Notes
Keep the base64 Doubao fallback during this change. That makes the new flow additive instead of a risky cutover and keeps local tests simpler while the frontend upload path settles.

View File

@ -0,0 +1,172 @@
# Doubao Frontend File ID Upload Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
**Goal:** Allow the frontend to upload a video to Ark Files API, receive a `file_id`, and send that `file_id` to this app's backend so Doubao subtitle generation uses `Responses API` file references instead of inline base64 video payloads.
**Architecture:** Add a frontend Ark upload helper plus a backward-compatible extension to the subtitle request contract. The backend will accept either an uploaded `video` file or a `fileId`, and the Doubao path will prefer `file_id` while Gemini stays on the current multipart upload flow.
**Tech Stack:** React, TypeScript, Express, Vitest, Fetch API, Volcengine Ark Files API, Volcengine Ark Responses API
---
### Task 1: Add failing frontend tests for the new Doubao request flow
**Files:**
- Modify: `E:\Downloads\ai-video-dubbing-&-translation\src\services\subtitleService.test.ts`
**Step 1: Write the failing tests**
Add tests that verify:
- Doubao first uploads the file to Ark Files API and then posts `fileId` to `/api/generate-subtitles`
- Gemini still uploads multipart form data with `video`
**Step 2: Run test to verify it fails**
Run: `npm.cmd run test -- src/services/subtitleService.test.ts`
Expected: FAIL because the service does not yet upload to Ark or send `fileId`
**Step 3: Write minimal implementation**
Update the frontend subtitle service to support an Ark upload step and dual request modes.
**Step 4: Run test to verify it passes**
Run: `npm.cmd run test -- src/services/subtitleService.test.ts`
Expected: PASS
**Step 5: Commit**
```bash
git add src/services/subtitleService.test.ts src/services/subtitleService.ts
git commit -m "feat: add frontend doubao file id upload flow"
```
### Task 2: Add failing backend tests for `fileId` parsing and Doubao request shape
**Files:**
- Modify: `E:\Downloads\ai-video-dubbing-&-translation\src\server\subtitleRequest.test.ts`
- Modify: `E:\Downloads\ai-video-dubbing-&-translation\src\server\videoSubtitleGeneration.test.ts`
**Step 1: Write the failing tests**
Add tests that verify:
- Subtitle request parsing accepts `fileId`
- Doubao `Responses API` request uses `file_id` and `fps`
- Gemini rejects requests that provide only `fileId`
**Step 2: Run test to verify it fails**
Run: `npm.cmd run test -- src/server/subtitleRequest.test.ts src/server/videoSubtitleGeneration.test.ts`
Expected: FAIL because parsing and generation do not yet support `fileId`
**Step 3: Write minimal implementation**
Extend server request parsing and video generation to accept `fileId`.
**Step 4: Run test to verify it passes**
Run: `npm.cmd run test -- src/server/subtitleRequest.test.ts src/server/videoSubtitleGeneration.test.ts`
Expected: PASS
**Step 5: Commit**
```bash
git add src/server/subtitleRequest.test.ts src/server/videoSubtitleGeneration.test.ts src/server/subtitleRequest.ts src/server/videoSubtitleGeneration.ts
git commit -m "feat: support doubao file id subtitle requests"
```
### Task 3: Wire the backend route to accept `fileId` without a multipart video upload
**Files:**
- Modify: `E:\Downloads\ai-video-dubbing-&-translation\server.ts`
- Modify: `E:\Downloads\ai-video-dubbing-&-translation\src\server\subtitleGeneration.ts`
**Step 1: Write the failing test**
If route-level coverage is already represented through unit seams, add or update a pipeline test that proves `fileId` can flow into subtitle generation without `videoPath`.
**Step 2: Run test to verify it fails**
Run: `npm.cmd run test -- src/server/subtitleGeneration.test.ts`
Expected: FAIL because the pipeline still assumes a local video path
**Step 3: Write minimal implementation**
Allow subtitle generation to receive either:
- `videoPath`
- `fileId`
Require at least one, and keep backend cleanup safe when no uploaded file exists.
**Step 4: Run test to verify it passes**
Run: `npm.cmd run test -- src/server/subtitleGeneration.test.ts`
Expected: PASS
**Step 5: Commit**
```bash
git add server.ts src/server/subtitleGeneration.ts src/server/subtitleGeneration.test.ts
git commit -m "feat: accept file id subtitle generation requests"
```
### Task 4: Verify editor behavior still works with provider switching
**Files:**
- Modify: `E:\Downloads\ai-video-dubbing-&-translation\src\components\EditorScreen.test.tsx`
**Step 1: Write the failing test**
Add or update coverage so the editor still calls subtitle generation correctly after the service signature change.
**Step 2: Run test to verify it fails**
Run: `npm.cmd run test -- src/components/EditorScreen.test.tsx`
Expected: FAIL because mocks or call signatures need updating
**Step 3: Write minimal implementation**
Adjust the editor or tests so the new service contract is reflected without changing the visible UX.
**Step 4: Run test to verify it passes**
Run: `npm.cmd run test -- src/components/EditorScreen.test.tsx`
Expected: PASS
**Step 5: Commit**
```bash
git add src/components/EditorScreen.test.tsx src/components/EditorScreen.tsx
git commit -m "test: align editor subtitle generation with file id flow"
```
### Task 5: Run focused regression coverage
**Files:**
- Test: `E:\Downloads\ai-video-dubbing-&-translation\src\services\subtitleService.test.ts`
- Test: `E:\Downloads\ai-video-dubbing-&-translation\src\server\subtitleRequest.test.ts`
- Test: `E:\Downloads\ai-video-dubbing-&-translation\src\server\subtitleGeneration.test.ts`
- Test: `E:\Downloads\ai-video-dubbing-&-translation\src\server\videoSubtitleGeneration.test.ts`
- Test: `E:\Downloads\ai-video-dubbing-&-translation\src\components\EditorScreen.test.tsx`
**Step 1: Run the focused suite**
Run: `npm.cmd run test -- src/services/subtitleService.test.ts src/server/subtitleRequest.test.ts src/server/subtitleGeneration.test.ts src/server/videoSubtitleGeneration.test.ts src/components/EditorScreen.test.tsx`
Expected: PASS
**Step 2: Fix any regressions**
Make only the minimal changes required to keep Doubao and Gemini flows working.
**Step 3: Re-run the focused suite**
Run: `npm.cmd run test -- src/services/subtitleService.test.ts src/server/subtitleRequest.test.ts src/server/subtitleGeneration.test.ts src/server/videoSubtitleGeneration.test.ts src/components/EditorScreen.test.tsx`
Expected: PASS
**Step 4: Commit**
```bash
git add src/services/subtitleService.ts src/services/subtitleService.test.ts src/server/subtitleRequest.ts src/server/subtitleRequest.test.ts src/server/subtitleGeneration.ts src/server/subtitleGeneration.test.ts src/server/videoSubtitleGeneration.ts src/server/videoSubtitleGeneration.test.ts src/components/EditorScreen.test.tsx server.ts docs/plans/2026-03-19-doubao-file-id-frontend-design.md docs/plans/2026-03-19-doubao-file-id-frontend.md
git commit -m "feat: use ark file ids for doubao subtitle generation"
```

View File

@ -20,6 +20,7 @@ import {
DEFAULT_EXPORT_TEXT_STYLES,
shiftSubtitlesToExportTimeline,
} from './src/server/exportVideo';
import { formatLogContext, serializeError } from './src/server/errorLogging';
import { TextStyles } from './src/types';
const upload = multer({
@ -49,6 +50,7 @@ if (ffprobePath) {
async function startServer() {
const app = express();
const PORT = 3000;
const subtitleUpload = upload.single('video');
app.use(cors());
app.use(express.json({ limit: '500mb' }));
@ -191,26 +193,55 @@ async function startServer() {
}
});
app.post('/api/generate-subtitles', upload.single('video'), async (req, res) => {
app.post('/api/generate-subtitles', (req, res, next) => {
if ((req.headers['content-type'] || '').includes('multipart/form-data')) {
return subtitleUpload(req, res, next);
}
next();
}, async (req, res) => {
const videoPath = req.file?.path;
const requestId = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
const startedAt = Date.now();
try {
if (!videoPath) {
const { provider, targetLanguage, fileId } = parseSubtitleRequest(req.body);
if (!videoPath && !fileId) {
return res.status(400).json({ error: 'No video file provided' });
}
const { provider, targetLanguage } = parseSubtitleRequest(req.body);
console.info(
`[subtitle] request started ${formatLogContext({
requestId,
provider,
targetLanguage,
fileName: req.file?.originalname,
fileSize: req.file?.size,
})}`,
);
const result = await generateSubtitlePipeline({
videoPath,
fileId,
provider,
targetLanguage,
env: process.env,
requestId,
});
console.info(
`[subtitle] request succeeded ${formatLogContext({
requestId,
provider,
targetLanguage,
durationMs: Date.now() - startedAt,
subtitleCount: result.subtitles.length,
})}`,
);
res.json({
...result,
provider,
requestId,
});
} catch (error: any) {
const message = error instanceof Error ? error.message : 'Failed to generate subtitles';
@ -228,8 +259,17 @@ async function startServer() {
? 401
: 502;
console.error('Subtitle Generation Error:', error);
res.status(status).json({ error: message });
console.error(
`[subtitle] request failed ${formatLogContext({
requestId,
durationMs: Date.now() - startedAt,
fileName: req.file?.originalname,
fileSize: req.file?.size,
status,
})}`,
serializeError(error),
);
res.status(status).json({ error: message, requestId });
} finally {
if (videoPath && fs.existsSync(videoPath)) fs.unlinkSync(videoPath);
}

View File

@ -95,4 +95,21 @@ describe('EditorScreen', () => {
),
);
});
it('only auto-generates subtitles once in StrictMode', async () => {
render(
<React.StrictMode>
<EditorScreen
videoFile={new File(['video'], 'clip.mp4', { type: 'video/mp4' })}
targetLanguage="en"
trimRange={null}
onBack={() => {}}
/>
</React.StrictMode>,
);
await waitFor(() => expect(generateSubtitlePipelineMock).toHaveBeenCalled());
expect(generateSubtitlePipelineMock).toHaveBeenCalledTimes(1);
});
});

View File

@ -39,6 +39,7 @@ export default function EditorScreen({ videoFile, targetLanguage, trimRange, onB
const [videoAspectRatio, setVideoAspectRatio] = useState<number>(16/9);
const containerRef = useRef<HTMLDivElement>(null);
const [renderedVideoWidth, setRenderedVideoWidth] = useState<number | '100%'>('100%');
const autoGenerationKeyRef = useRef<string | null>(null);
// Timeline Dragging State
const [draggingId, setDraggingId] = useState<string | null>(null);
@ -137,8 +138,21 @@ export default function EditorScreen({ videoFile, targetLanguage, trimRange, onB
// Generate subtitles on mount
useEffect(() => {
const autoGenerationKey = JSON.stringify({
fileName: videoFile?.name || '',
fileSize: videoFile?.size || 0,
targetLanguage,
trimRange,
llmProvider,
});
if (autoGenerationKeyRef.current === autoGenerationKey) {
return;
}
autoGenerationKeyRef.current = autoGenerationKey;
fetchSubtitles();
}, [fetchSubtitles]);
}, [fetchSubtitles, videoFile, targetLanguage, trimRange, llmProvider]);
const [textStyles, setTextStyles] = useState<TextStyles>({
fontFamily: 'MiSans-Late',

View File

@ -0,0 +1,36 @@
import { describe, expect, it } from 'vitest';
import { formatLogContext, serializeError } from './errorLogging';
describe('errorLogging', () => {
it('serializes nested causes and error codes', () => {
const cause = new Error('Headers Timeout Error');
const error = new TypeError('fetch failed', { cause }) as TypeError & {
code?: string;
};
error.code = 'UND_ERR_HEADERS_TIMEOUT';
expect(serializeError(error)).toEqual(
expect.objectContaining({
message: 'fetch failed',
name: 'TypeError',
code: 'UND_ERR_HEADERS_TIMEOUT',
cause: {
message: 'Headers Timeout Error',
name: 'Error',
code: undefined,
},
}),
);
});
it('formats context as stable key-value pairs', () => {
expect(
formatLogContext({
requestId: 'req-1',
provider: 'doubao',
durationMs: 1234,
ignored: undefined,
}),
).toBe('requestId=req-1 provider=doubao durationMs=1234');
});
});

View File

@ -0,0 +1,51 @@
export interface ErrorLogDetails {
message: string;
stack?: string;
cause?: unknown;
code?: string;
name?: string;
}
export const serializeError = (error: unknown): ErrorLogDetails => {
if (error instanceof Error) {
const details: ErrorLogDetails = {
message: error.message,
name: error.name,
stack: error.stack,
};
const errorWithCode = error as Error & { code?: string; cause?: unknown };
if (errorWithCode.code) {
details.code = errorWithCode.code;
}
if (errorWithCode.cause !== undefined) {
details.cause = serializeCause(errorWithCode.cause);
}
return details;
}
return {
message: typeof error === 'string' ? error : JSON.stringify(error),
};
};
export const formatLogContext = (context: Record<string, unknown>) =>
Object.entries(context)
.filter(([, value]) => value !== undefined)
.map(([key, value]) => `${key}=${String(value)}`)
.join(' ');
const serializeCause = (cause: unknown): unknown => {
if (cause instanceof Error) {
const nested = serializeError(cause);
return {
message: nested.message,
name: nested.name,
code: nested.code,
};
}
return cause;
};

View File

@ -1,6 +1,7 @@
import { describe, expect, it } from 'vitest';
import {
DEFAULT_DOUBAO_MODEL,
DEFAULT_DOUBAO_TIMEOUT_MS,
DEFAULT_LLM_PROVIDER,
normalizeLlmProvider,
resolveLlmProviderConfig,
@ -30,6 +31,22 @@ describe('llmProvider', () => {
apiKey: 'ark-key',
model: DEFAULT_DOUBAO_MODEL,
baseUrl: 'https://ark.cn-beijing.volces.com/api/v3/responses',
timeoutMs: DEFAULT_DOUBAO_TIMEOUT_MS,
});
});
it('reads a custom doubao timeout from env', () => {
expect(
resolveLlmProviderConfig('doubao', {
ARK_API_KEY: 'ark-key',
DOUBAO_TIMEOUT_MS: '600000',
}),
).toEqual({
provider: 'doubao',
apiKey: 'ark-key',
model: DEFAULT_DOUBAO_MODEL,
baseUrl: 'https://ark.cn-beijing.volces.com/api/v3/responses',
timeoutMs: 600000,
});
});

View File

@ -2,6 +2,7 @@ export const DEFAULT_LLM_PROVIDER = 'doubao';
export const DEFAULT_DOUBAO_MODEL = 'doubao-seed-2-0-pro-260215';
export const DEFAULT_GEMINI_MODEL = 'gemini-2.5-flash';
export const DEFAULT_DOUBAO_RESPONSES_URL = 'https://ark.cn-beijing.volces.com/api/v3/responses';
export const DEFAULT_DOUBAO_TIMEOUT_MS = 600000;
export type LlmProvider = 'doubao' | 'gemini';
@ -10,6 +11,7 @@ export interface DoubaoProviderConfig {
apiKey: string;
model: string;
baseUrl: string;
timeoutMs: number;
}
export interface GeminiProviderConfig {
@ -20,6 +22,15 @@ export interface GeminiProviderConfig {
export type LlmProviderConfig = DoubaoProviderConfig | GeminiProviderConfig;
const resolveDoubaoTimeoutMs = (value?: string) => {
const parsed = Number(value?.trim());
if (!Number.isFinite(parsed) || parsed <= 0) {
return DEFAULT_DOUBAO_TIMEOUT_MS;
}
return Math.floor(parsed);
};
export const normalizeLlmProvider = (value?: string | null): LlmProvider => {
if (!value) {
return DEFAULT_LLM_PROVIDER;
@ -48,6 +59,7 @@ export const resolveLlmProviderConfig = (
apiKey,
model: env.DOUBAO_MODEL?.trim() || DEFAULT_DOUBAO_MODEL,
baseUrl: (env.DOUBAO_BASE_URL?.trim() || DEFAULT_DOUBAO_RESPONSES_URL).replace(/\/+$/, ''),
timeoutMs: resolveDoubaoTimeoutMs(env.DOUBAO_TIMEOUT_MS),
};
}

View File

@ -16,6 +16,7 @@ describe('createSentenceTranslator', () => {
apiKey: 'ark-key',
model: 'doubao-seed-2-0-pro-260215',
baseUrl: 'https://ark.cn-beijing.volces.com/api/v3/responses',
timeoutMs: 600000,
});
expect(translator).toBe('doubao-translator');

View File

@ -66,6 +66,7 @@ describe('generateSubtitlePipeline', () => {
apiKey: 'ark-key',
model: 'doubao-seed-2-0-pro-260215',
baseUrl: 'https://ark.cn-beijing.volces.com/api/v3/responses',
timeoutMs: 600000,
},
}),
);
@ -89,6 +90,7 @@ describe('generateSubtitlePipeline', () => {
ARK_API_KEY: 'ark-key',
},
fetchImpl,
requestId: 'req-123',
deps: {
generateSubtitlesFromVideo,
},
@ -97,6 +99,36 @@ describe('generateSubtitlePipeline', () => {
expect(generateSubtitlesFromVideo).toHaveBeenCalledWith(
expect.objectContaining({
fetchImpl,
requestId: 'req-123',
}),
);
});
it('passes file id through to video subtitle generation', async () => {
const subtitleResult: SubtitlePipelineResult = {
subtitles: [],
speakers: [],
quality: 'fallback',
targetLanguage: 'English',
};
const generateSubtitlesFromVideo = vi.fn(async () => subtitleResult);
await generateSubtitlePipeline({
fileId: 'file-123',
targetLanguage: 'English',
provider: 'doubao',
env: {
ARK_API_KEY: 'ark-key',
},
deps: {
generateSubtitlesFromVideo,
},
});
expect(generateSubtitlesFromVideo).toHaveBeenCalledWith(
expect.objectContaining({
fileId: 'file-123',
videoPath: undefined,
}),
);
});

View File

@ -3,11 +3,13 @@ import { resolveLlmProviderConfig, normalizeLlmProvider } from './llmProvider';
import { generateSubtitlesFromVideo as defaultGenerateSubtitlesFromVideo } from './videoSubtitleGeneration';
export interface GenerateSubtitlePipelineOptions {
videoPath: string;
videoPath?: string;
fileId?: string;
targetLanguage: string;
provider?: string | null;
env: NodeJS.ProcessEnv;
fetchImpl?: typeof fetch;
requestId?: string;
deps?: {
generateSubtitlesFromVideo?: typeof defaultGenerateSubtitlesFromVideo;
};
@ -15,12 +17,18 @@ export interface GenerateSubtitlePipelineOptions {
export const generateSubtitlePipeline = async ({
videoPath,
fileId,
targetLanguage,
provider,
env,
fetchImpl,
requestId,
deps,
}: GenerateSubtitlePipelineOptions) => {
if (!videoPath && !fileId) {
throw new Error('A video upload or fileId is required.');
}
const audioPipelineConfig = resolveAudioPipelineConfig(env);
const selectedProvider = provider
? normalizeLlmProvider(provider)
@ -32,7 +40,9 @@ export const generateSubtitlePipeline = async ({
return generateSubtitlesFromVideo({
providerConfig,
videoPath,
fileId,
targetLanguage,
requestId,
...(fetchImpl ? { fetchImpl } : {}),
});
};

View File

@ -26,4 +26,17 @@ describe('parseSubtitleRequest', () => {
/target language/i,
);
});
it('preserves a file id when provided', () => {
expect(
parseSubtitleRequest({
targetLanguage: 'English',
fileId: 'file-123',
}),
).toEqual({
provider: 'doubao',
targetLanguage: 'English',
fileId: 'file-123',
});
});
});

View File

@ -3,11 +3,13 @@ import { LlmProvider, normalizeLlmProvider } from './llmProvider';
export interface SubtitleRequestBody {
provider?: string | null;
targetLanguage?: string | null;
fileId?: string | null;
}
export interface ParsedSubtitleRequest {
provider: LlmProvider;
targetLanguage: string;
fileId?: string;
}
export const parseSubtitleRequest = (
@ -21,5 +23,6 @@ export const parseSubtitleRequest = (
return {
provider: normalizeLlmProvider(body.provider),
targetLanguage,
...(body.fileId?.trim() ? { fileId: body.fileId.trim() } : {}),
};
};

View File

@ -0,0 +1,126 @@
import fs from 'fs';
import { afterEach, describe, expect, it, vi } from 'vitest';
import { generateSubtitlesFromVideo } from './videoSubtitleGeneration';
describe('generateSubtitlesFromVideo', () => {
afterEach(() => {
vi.restoreAllMocks();
});
it('passes the configured doubao timeout to fetch', async () => {
vi.spyOn(fs, 'readFileSync').mockReturnValue(Buffer.from('video-bytes'));
const fetchImpl = vi.fn<typeof fetch>(async () =>
new Response(
JSON.stringify({
output: [
{
content: [
{
text: JSON.stringify({
sourceLanguage: 'zh',
subtitles: [
{
originalText: '你好',
translatedText: 'Hello',
startTime: 0,
endTime: 1,
},
],
}),
},
],
},
],
}),
{
status: 200,
headers: { 'Content-Type': 'application/json' },
},
),
);
await generateSubtitlesFromVideo({
providerConfig: {
provider: 'doubao',
apiKey: 'ark-key',
model: 'doubao-seed-2-0-pro-260215',
baseUrl: 'https://ark.cn-beijing.volces.com/api/v3/responses',
timeoutMs: 600000,
},
videoPath: 'clip.mp4',
targetLanguage: 'English',
fetchImpl,
});
expect(fetchImpl).toHaveBeenCalledWith(
'https://ark.cn-beijing.volces.com/api/v3/responses',
expect.objectContaining({
method: 'POST',
signal: expect.any(AbortSignal),
}),
);
});
it('uses ark file ids for doubao requests when available', async () => {
const fetchImpl = vi.fn<typeof fetch>(async () =>
new Response(
JSON.stringify({
output: [
{
content: [
{
text: JSON.stringify({
sourceLanguage: 'zh',
subtitles: [
{
originalText: '你好',
translatedText: 'Hello',
startTime: 0,
endTime: 1,
},
],
}),
},
],
},
],
}),
{
status: 200,
headers: { 'Content-Type': 'application/json' },
},
),
);
await generateSubtitlesFromVideo({
providerConfig: {
provider: 'doubao',
apiKey: 'ark-key',
model: 'doubao-seed-2-0-pro-260215',
baseUrl: 'https://ark.cn-beijing.volces.com/api/v3/responses',
timeoutMs: 600000,
},
fileId: 'file-123',
targetLanguage: 'English',
fetchImpl,
});
const [, request] = fetchImpl.mock.calls[0] as [string, RequestInit];
const payload = JSON.parse(String(request.body));
expect(payload.input[0].role).toBe('system');
expect(payload.input[0].content[0].type).toBe('input_text');
expect(payload.input[0].content[0].text).toContain('# Role');
expect(payload.input[0].content[0].text).toContain('Voice Selection');
expect(payload.input[1].role).toBe('user');
expect(payload.input[1].content[0]).toEqual({
type: 'input_video',
file_id: 'file-123',
});
expect(payload.input[1].content[1].type).toBe('input_text');
expect(payload.input[1].content[1].text).toContain('Target language: English');
expect(payload.input[1].content[1].text).toContain('Available voices');
expect(payload.input[1].content[1].text).toContain('Sweet_Girl');
});
});

View File

@ -1,6 +1,8 @@
import fs from 'fs';
import { GoogleGenAI } from '@google/genai';
import { SubtitlePipelineResult } from '../types';
import { MINIMAX_VOICES } from '../voices';
import { formatLogContext, serializeError } from './errorLogging';
import { DoubaoProviderConfig, GeminiProviderConfig, LlmProviderConfig } from './llmProvider';
interface RawModelSubtitle {
@ -19,13 +21,7 @@ interface RawModelResponse {
}
const DEFAULT_VOICE_ID = 'male-qn-qingse';
const SUPPORTED_VOICE_IDS = new Set([
DEFAULT_VOICE_ID,
'female-shaonv',
'female-yujie',
'male-qn-jingying',
'male-qn-badao',
]);
const SUPPORTED_VOICE_IDS = new Set(MINIMAX_VOICES.map((voice) => voice.id));
const stripJsonFences = (text: string) => text.replace(/```json\n?|\n?```/g, '').trim();
@@ -60,10 +56,85 @@ const sanitizeVoiceId = (value: unknown) => {
return SUPPORTED_VOICE_IDS.has(value) ? value : DEFAULT_VOICE_ID;
};
const createPrompt = (targetLanguage: string) => `You are a subtitle localization engine.
Analyze the input video and output STRICT JSON only.
const LANGUAGE_ALIASES: Record<string, string> = {
zh: 'zh',
chinese: 'zh',
mandarin: 'zh',
'chinese mandarin': 'zh',
english: 'en',
en: 'en',
french: 'fr',
fr: 'fr',
indonesian: 'id',
id: 'id',
german: 'de',
de: 'de',
filipino: 'fil',
fil: 'fil',
cantonese: 'yue',
yue: 'yue',
};
Return an object:
const normalizeTargetLanguageCode = (targetLanguage: string) =>
LANGUAGE_ALIASES[targetLanguage.trim().toLowerCase()] || targetLanguage.trim().toLowerCase();
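For clarity, the normalization above falls through to the lowercased input when no alias matches, so unknown languages are passed along as-is. A self-contained check (alias map abbreviated from the one above):

```typescript
// Abbreviated copy of the alias map and normalizer defined above,
// shown standalone to illustrate the fall-through behavior.
const LANGUAGE_ALIASES: Record<string, string> = {
  zh: 'zh',
  chinese: 'zh',
  mandarin: 'zh',
  english: 'en',
  en: 'en',
  cantonese: 'yue',
  yue: 'yue',
};

const normalizeTargetLanguageCode = (targetLanguage: string) =>
  LANGUAGE_ALIASES[targetLanguage.trim().toLowerCase()] || targetLanguage.trim().toLowerCase();
```

Known aliases map to a code (`'  Chinese '` becomes `'zh'`); anything else, e.g. `'Japanese'`, comes back trimmed and lowercased (`'japanese'`).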
const formatVoiceCatalogForPrompt = (targetLanguage: string) => {
const languageCode = normalizeTargetLanguageCode(targetLanguage);
const matchingVoices = MINIMAX_VOICES.filter((voice) => voice.language === languageCode);
const voices = matchingVoices.length > 0 ? matchingVoices : MINIMAX_VOICES;
return voices
.map(
(voice) =>
`- voiceId: ${voice.id} | gender: ${voice.gender} | language: ${voice.language} | tag: ${voice.tag} | name: ${voice.name}`,
)
.join('\n');
};
const createSystemPrompt = () => `# Role
You are a senior film and TV subtitle expert and an advanced localization translator.
You deeply understand screen reading experience.
Subtitles must be short, easy to read, precisely timed to the visuals and speech rhythm, and must never cause viewer reading fatigue.
# Task
Listen to and watch the user-provided audio or video.
Transcribe the spoken content and translate it into the target language specified by the user.
Extract highly accurate start and end timestamps, speaker labels, and speaker gender.
Select the most suitable voiceId for each subtitle item by matching the speaker's gender, tone, style, and delivery to the voice options provided by the user.
Return the result strictly in the required JSON format.
# Constraints
1. Strict Subtitle Splitting:
Absolutely do not accumulate or merge long sentences into oversized subtitle lines.
You must split subtitles according to the speaker's actual breathing, pauses, commas, short hesitations, and natural phrasing.
2. Screen-Friendly Length:
Each subtitle item must be short.
Chinese text should ideally stay within 15 to 20 characters.
English text should ideally stay within 7 to 10 words.
If a sentence is too long, you must split it into multiple subtitle objects with consecutive timestamps.
3. Highly Precise Timestamps:
Timestamps must align closely with the actual speech.
Use floating-point seconds.
The duration of a single subtitle item should usually not exceed 3 to 5 seconds.
4. Speaker and Gender:
Accurately identify the speaker label and speaker gender.
Gender must be either "male" or "female".
5. Voice Selection:
The user will provide the target language and a list of available voices.
Each voice includes a voiceId and descriptive metadata.
You must analyze the user-provided voice list and choose the best matching voiceId for each subtitle item.
Only return a voiceId that exists in the user-provided voice list.
Do not invent new voiceId values.
6. Output Format:
Return only valid JSON.
Do not output markdown, code fences, explanations, or any extra text.
Return an object with this exact structure:
{
"sourceLanguage": "detected language code",
"subtitles": [
@@ -72,19 +143,29 @@ Return an object:
"startTime": 0.0,
"endTime": 1.2,
"originalText": "source dialogue",
"translatedText": "translated dialogue in ${targetLanguage}",
"translatedText": "translated dialogue in the target language",
"speaker": "short speaker label",
"voiceId": "one of: male-qn-qingse, female-shaonv, female-yujie, male-qn-jingying, male-qn-badao"
"gender": "male or female",
"voiceId": "one of the user-provided voice ids"
}
]
}
Rules:
1. Use video timeline seconds for startTime/endTime.
Additional rules:
1. Use video timeline seconds for startTime and endTime.
2. Keep subtitles chronological and non-overlapping.
3. Do not invent dialogue if not audible.
4. translatedText must be in ${targetLanguage}.
5. Do not include markdown. JSON only.`;
3. Do not invent dialogue if it is not actually audible.
4. Preserve meaning naturally while keeping subtitle lines short and readable.
5. If a long utterance must be split, preserve continuity across consecutive subtitle items.
6. Output JSON only.`;
const createUserPrompt = (targetLanguage: string) => `Target language: ${targetLanguage}
Available voices:
${formatVoiceCatalogForPrompt(targetLanguage)}
Please watch and listen to the provided video.
Transcribe the dialogue, translate it into ${targetLanguage}, and assign the best matching voiceId from the available voices for each subtitle item.`;
const normalizeSubtitles = (raw: RawModelSubtitle[]) => {
let lastEnd = 0;
@@ -135,40 +216,91 @@ const extractDoubaoTextOutput = (payload: any): string => {
const generateWithDoubao = async ({
config,
videoDataUrl,
fileId,
targetLanguage,
fetchImpl = fetch,
requestId,
}: {
config: DoubaoProviderConfig;
videoDataUrl: string;
videoDataUrl?: string;
fileId?: string;
targetLanguage: string;
fetchImpl?: typeof fetch;
requestId?: string;
}) => {
const response = await fetchImpl(config.baseUrl, {
method: 'POST',
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json',
},
body: JSON.stringify({
model: config.model,
input: [
{
role: 'user',
content: [
{ type: 'input_video', video_url: videoDataUrl },
{ type: 'input_text', text: createPrompt(targetLanguage) },
],
},
],
}),
const startedAt = Date.now();
const logContext = formatLogContext({
requestId,
provider: 'doubao',
timeoutMs: config.timeoutMs,
targetLanguage,
});
console.info(`[subtitle] doubao request started ${logContext}`);
let response: Response;
try {
response = await fetchImpl(config.baseUrl, {
method: 'POST',
signal: AbortSignal.timeout(config.timeoutMs),
headers: {
Authorization: `Bearer ${config.apiKey}`,
'Content-Type': 'application/json',
},
body: JSON.stringify({
model: config.model,
input: [
{
role: 'system',
content: [
{ type: 'input_text', text: createSystemPrompt() },
],
},
{
role: 'user',
content: [
fileId
? { type: 'input_video', file_id: fileId }
: { type: 'input_video', video_url: videoDataUrl },
{ type: 'input_text', text: createUserPrompt(targetLanguage) },
],
},
],
}),
});
} catch (error) {
console.error(
`[subtitle] doubao request failed ${formatLogContext({
requestId,
provider: 'doubao',
timeoutMs: config.timeoutMs,
durationMs: Date.now() - startedAt,
})}`,
serializeError(error),
);
throw error;
}
if (!response.ok) {
const payload = await response.text();
console.error(
`[subtitle] doubao request returned non-200 ${formatLogContext({
requestId,
status: response.status,
durationMs: Date.now() - startedAt,
})}`,
payload,
);
throw new Error(`Doubao subtitle request failed (${response.status}): ${payload}`);
}
const payload = await response.json();
console.info(
`[subtitle] doubao request finished ${formatLogContext({
requestId,
durationMs: Date.now() - startedAt,
})}`,
);
const text = extractDoubaoTextOutput(payload);
return extractJson(text);
};
@@ -195,7 +327,7 @@ const generateWithGemini = async ({
data: videoBase64,
},
},
{ text: createPrompt(targetLanguage) },
{ text: `${createSystemPrompt()}\n\n${createUserPrompt(targetLanguage)}` },
],
},
],
@ -207,29 +339,39 @@ const generateWithGemini = async ({
export const generateSubtitlesFromVideo = async ({
providerConfig,
videoPath,
fileId,
targetLanguage,
fetchImpl = fetch,
requestId,
}: {
providerConfig: LlmProviderConfig;
videoPath: string;
videoPath?: string;
fileId?: string;
targetLanguage: string;
fetchImpl?: typeof fetch;
requestId?: string;
}): Promise<SubtitlePipelineResult> => {
const videoBuffer = fs.readFileSync(videoPath);
const videoBase64 = videoBuffer.toString('base64');
const videoDataUrl = `data:video/mp4;base64,${videoBase64}`;
if (providerConfig.provider === 'gemini' && !videoPath) {
throw new Error('Gemini subtitle generation requires an uploaded video file.');
}
const videoBuffer = videoPath ? fs.readFileSync(videoPath) : null;
const videoBase64 = videoBuffer?.toString('base64');
const videoDataUrl = videoBase64 ? `data:video/mp4;base64,${videoBase64}` : undefined;
const raw =
providerConfig.provider === 'doubao'
? await generateWithDoubao({
config: providerConfig,
videoDataUrl,
fileId,
targetLanguage,
fetchImpl,
requestId,
})
: await generateWithGemini({
config: providerConfig,
videoBase64,
videoBase64: videoBase64!,
targetLanguage,
});

View File

@@ -1,10 +1,19 @@
// @vitest-environment jsdom
import { describe, expect, it, vi } from 'vitest';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { generateSubtitlePipeline } from './subtitleService';
describe('generateSubtitlePipeline', () => {
it('posts the selected provider to the server', async () => {
beforeEach(() => {
vi.stubEnv('VITE_ARK_API_KEY', 'ark-key');
});
afterEach(() => {
vi.unstubAllEnvs();
vi.useRealTimers();
});
it('posts the selected provider to the server for gemini', async () => {
const fetchMock = vi.fn(async () =>
new Response(
JSON.stringify({
@@ -24,7 +33,7 @@ describe('generateSubtitlePipeline', () => {
await generateSubtitlePipeline(
new File(['video'], 'clip.mp4', { type: 'video/mp4' }),
'English',
'doubao',
'gemini',
null,
fetchMock as unknown as typeof fetch,
);
@@ -40,6 +49,210 @@ describe('generateSubtitlePipeline', () => {
const [, requestInit] = fetchMock.mock.calls[0] as unknown as [string, RequestInit];
const formData = requestInit.body as FormData;
expect(formData.get('targetLanguage')).toBe('English');
expect(formData.get('provider')).toBe('doubao');
expect(formData.get('provider')).toBe('gemini');
});
it('uploads doubao videos to ark files before requesting subtitles', async () => {
vi.useFakeTimers();
const fetchMock = vi
.fn()
.mockResolvedValueOnce(
new Response(
JSON.stringify({
id: 'file-123',
}),
{
status: 200,
headers: {
'Content-Type': 'application/json',
},
},
),
)
.mockResolvedValueOnce(
new Response(
JSON.stringify({
id: 'file-123',
status: 'processing',
}),
{
status: 200,
headers: {
'Content-Type': 'application/json',
},
},
),
)
.mockResolvedValueOnce(
new Response(
JSON.stringify({
id: 'file-123',
status: 'active',
}),
{
status: 200,
headers: {
'Content-Type': 'application/json',
},
},
),
)
.mockResolvedValueOnce(
new Response(
JSON.stringify({
subtitles: [],
speakers: [],
quality: 'fallback',
}),
{
status: 200,
headers: {
'Content-Type': 'application/json',
},
},
),
);
const promise = generateSubtitlePipeline(
new File(['video'], 'clip.mp4', { type: 'video/mp4' }),
'English',
'doubao',
null,
fetchMock as unknown as typeof fetch,
);
await vi.runAllTimersAsync();
await promise;
expect(fetchMock).toHaveBeenNthCalledWith(
1,
'https://ark.cn-beijing.volces.com/api/v3/files',
expect.objectContaining({
method: 'POST',
body: expect.any(FormData),
}),
);
expect(fetchMock).toHaveBeenNthCalledWith(
2,
'https://ark.cn-beijing.volces.com/api/v3/files/file-123',
expect.objectContaining({
method: 'GET',
headers: {
Authorization: 'Bearer ark-key',
},
}),
);
expect(fetchMock).toHaveBeenNthCalledWith(
3,
'https://ark.cn-beijing.volces.com/api/v3/files/file-123',
expect.objectContaining({
method: 'GET',
headers: {
Authorization: 'Bearer ark-key',
},
}),
);
const [, subtitleRequest] = fetchMock.mock.calls[3] as unknown as [string, RequestInit];
const subtitleBody = JSON.parse(String(subtitleRequest.body));
expect(fetchMock).toHaveBeenNthCalledWith(
4,
'/api/generate-subtitles',
expect.objectContaining({
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
}),
);
expect(subtitleBody).toEqual({
fileId: 'file-123',
provider: 'doubao',
targetLanguage: 'English',
});
});
it('stops when ark reports file preprocessing failure', async () => {
const fetchMock = vi
.fn()
.mockResolvedValueOnce(
new Response(
JSON.stringify({
id: 'file-123',
}),
{
status: 200,
headers: {
'Content-Type': 'application/json',
},
},
),
)
.mockResolvedValueOnce(
new Response(
JSON.stringify({
id: 'file-123',
status: 'failed',
error: {
message: 'video preprocess failed',
},
}),
{
status: 200,
headers: {
'Content-Type': 'application/json',
},
},
),
);
await expect(
generateSubtitlePipeline(
new File(['video'], 'clip.mp4', { type: 'video/mp4' }),
'English',
'doubao',
null,
fetchMock as unknown as typeof fetch,
),
).rejects.toThrow(/video preprocess failed/i);
expect(fetchMock).toHaveBeenCalledTimes(2);
});
it('keeps multipart uploads for gemini requests', async () => {
const fetchMock = vi.fn(async () =>
new Response(
JSON.stringify({
subtitles: [],
speakers: [],
quality: 'fallback',
}),
{
status: 200,
headers: {
'Content-Type': 'application/json',
},
},
),
);
await generateSubtitlePipeline(
new File(['video'], 'clip.mp4', { type: 'video/mp4' }),
'English',
'gemini',
null,
fetchMock as unknown as typeof fetch,
);
expect(fetchMock).toHaveBeenCalledTimes(1);
expect(fetchMock).toHaveBeenCalledWith(
'/api/generate-subtitles',
expect.objectContaining({
method: 'POST',
body: expect.any(FormData),
}),
);
});
});

View File

@@ -5,6 +5,10 @@ type JsonResponseResult<T> =
| { ok: true; status: number; data: T }
| { ok: false; status: number; error: string };
const ARK_FILES_URL = 'https://ark.cn-beijing.volces.com/api/v3/files';
const ARK_FILE_STATUS_POLL_INTERVAL_MS = 1000;
const ARK_FILE_STATUS_TIMEOUT_MS = 120000;
const normalizePipelineQuality = (value: unknown): PipelineQuality => {
if (value === 'full' || value === 'partial' || value === 'fallback') {
return value;
@@ -38,6 +42,90 @@ const readJsonResponseOnce = async <T>(resp: Response): Promise<JsonResponseResu
};
};
const uploadDoubaoVideoFile = async (
videoFile: File,
fetchImpl: typeof fetch,
): Promise<{ fileId: string; apiKey: string }> => {
const apiKey = import.meta.env.VITE_ARK_API_KEY?.trim();
if (!apiKey) {
throw new Error('VITE_ARK_API_KEY is required for frontend Doubao file uploads.');
}
const formData = new FormData();
formData.append('purpose', 'user_data');
formData.append('file', videoFile);
formData.append('preprocess_configs[video][fps]', '1');
const resp = await fetchImpl(ARK_FILES_URL, {
method: 'POST',
headers: {
Authorization: `Bearer ${apiKey}`,
},
body: formData,
});
const parsed = await readJsonResponseOnce<{ id?: string }>(resp);
if (parsed.ok === false) {
throw new Error(parsed.error);
}
const fileId = parsed.data.id?.trim();
if (!fileId) {
throw new Error('Ark Files API did not return a file id.');
}
return { fileId, apiKey };
};
const sleep = (durationMs: number) =>
new Promise((resolve) => {
setTimeout(resolve, durationMs);
});
const waitForArkFileToBecomeActive = async (
fileId: string,
apiKey: string,
fetchImpl: typeof fetch,
): Promise<void> => {
const deadline = Date.now() + ARK_FILE_STATUS_TIMEOUT_MS;
while (true) {
const resp = await fetchImpl(`${ARK_FILES_URL}/${fileId}`, {
method: 'GET',
headers: {
Authorization: `Bearer ${apiKey}`,
},
});
const parsed = await readJsonResponseOnce<{
status?: string;
error?: { message?: string } | string;
}>(resp);
if (parsed.ok === false) {
throw new Error(parsed.error);
}
const status = parsed.data.status?.trim().toLowerCase();
if (status === 'active') {
return;
}
if (status === 'failed') {
const errorMessage =
typeof parsed.data.error === 'string'
? parsed.data.error
: parsed.data.error?.message || 'Ark file preprocessing failed.';
throw new Error(errorMessage);
}
if (Date.now() >= deadline) {
throw new Error('Timed out while waiting for Ark file preprocessing to complete.');
}
await sleep(ARK_FILE_STATUS_POLL_INTERVAL_MS);
}
};
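The status loop above is an instance of a generic poll-until-terminal pattern: query a status endpoint on an interval until a terminal state arrives or a deadline passes. A minimal standalone sketch (hypothetical `pollUntilTerminal` helper, not part of this module's API):

```typescript
// Hypothetical sketch of the pattern behind waitForArkFileToBecomeActive:
// repeatedly invoke a status getter until it reports a terminal state
// ('active' or 'failed') or the deadline is exceeded.
type ArkFileStatus = 'processing' | 'active' | 'failed';

const pollUntilTerminal = async (
  getStatus: () => Promise<ArkFileStatus>,
  intervalMs: number,
  timeoutMs: number,
): Promise<ArkFileStatus> => {
  const deadline = Date.now() + timeoutMs;
  while (true) {
    const status = await getStatus();
    if (status === 'active' || status === 'failed') {
      return status;
    }
    if (Date.now() >= deadline) {
      throw new Error('Timed out while polling for a terminal status.');
    }
    // Wait one interval before the next status check.
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
};
```

Unlike the function above, this sketch returns `'failed'` instead of throwing, leaving it to the caller to surface the preprocessing error message; either design works, the important part is that both terminal states stop the loop.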
export const generateSubtitlePipeline = async (
videoFile: File,
targetLanguage: string,
@@ -49,6 +137,41 @@ export const generateSubtitlePipeline = async (
throw new Error('Target language is required.');
}
if (provider === 'doubao') {
const { fileId, apiKey } = await uploadDoubaoVideoFile(videoFile, fetchImpl);
await waitForArkFileToBecomeActive(fileId, apiKey, fetchImpl);
const resp = await fetchImpl(apiUrl('/generate-subtitles'), {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
fileId,
targetLanguage,
provider,
...(trimRange ? { trimRange } : {}),
}),
});
const parsed = await readJsonResponseOnce<Partial<SubtitlePipelineResult>>(resp);
if (parsed.ok === false) {
const error = new Error(parsed.error);
(error as any).status = resp.status;
throw error;
}
return {
subtitles: Array.isArray(parsed.data.subtitles) ? parsed.data.subtitles : [],
speakers: Array.isArray(parsed.data.speakers) ? parsed.data.speakers : [],
quality: normalizePipelineQuality(parsed.data.quality),
sourceLanguage: parsed.data.sourceLanguage,
targetLanguage: parsed.data.targetLanguage || targetLanguage,
duration:
typeof parsed.data.duration === 'number' ? parsed.data.duration : undefined,
alignmentEngine: parsed.data.alignmentEngine,
};
}
const formData = new FormData();
formData.append('video', videoFile);
formData.append('targetLanguage', targetLanguage);

src/vite-env.d.ts vendored Normal file
View File

@@ -0,0 +1 @@
/// <reference types="vite/client" />

View File

@@ -1,4 +0,0 @@
@echo off
setlocal
cd /d "%~dp0"
node ".\node_modules\tsx\dist\cli.mjs" server.ts

View File

@@ -5,6 +5,10 @@ import { defineConfig, loadEnv } from 'vite';
export default defineConfig(({ mode }) => {
const env = loadEnv(mode, '.', '');
const allowedHosts = env.VITE_ALLOWED_HOSTS
? env.VITE_ALLOWED_HOSTS.split(',').map((host) => host.trim()).filter(Boolean)
: ['ced4302.r20.vip.cpolar.cn'];
return {
base: env.VITE_BASE_URL || '/',
plugins: [react(), tailwindcss()],
@@ -17,10 +21,10 @@ export default defineConfig(({ mode }) => {
},
},
server: {
allowedHosts,
// HMR is disabled in AI Studio via DISABLE_HMR env var.
// Do not modify—file watching is disabled to prevent flickering during agent edits.
// Do not modify. File watching is disabled to prevent flickering during agent edits.
hmr: process.env.DISABLE_HMR !== 'true',
},
};
});