// Screen-VLA / App.tsx
// Gemini
// VLA Data Generator - Complete TypeScript/React app with backend
// 256cef9
import React, { useState, useCallback, useRef } from 'react';
import { VlaData, TaskSegment, Interaction } from './types';
import { generateOverallGoal, generateTasksAndInteractions } from './services/backendService';
import { extractFramesFromVideo } from './utils/videoProcessor';
import { VideoUploader } from './components/VideoUploader';
import { VideoPlayer } from './components/VideoPlayer';
import { ResultsDisplay } from './components/ResultsDisplay';
import { WandSparkles } from './components/Icons';
/**
 * Point to be highlighted on the video, or `null` when nothing is highlighted.
 * `isEditing` marks a "sticky" highlight: the app refuses to replace an editing
 * highlight with a non-editing (e.g. hover) one.
 */
type HighlightPoint = { x: number; y: number; isEditing: boolean } | null;
/**
 * Callback that receives the coordinates of a click on the video while a
 * coordinate picker is registered, or `null` when no picker is active.
 */
type CoordinatePickerCallback = ((coords: { x: number; y: number }) => void) | null;
/**
 * Root component of the VLA Data Generator.
 *
 * Holds the uploaded video, the generated VLA data and the loading/error state,
 * and wires them to the video player (left column) and the results panel
 * (right column). Generation runs in three steps: frame extraction, overall
 * goal generation, then task/interaction generation.
 */
export default function App(): React.ReactNode {
  const [videoFile, setVideoFile] = useState<File | null>(null);
  const [videoSrc, setVideoSrc] = useState<string | null>(null);
  const [videoDuration, setVideoDuration] = useState<number>(0);
  const [vlaData, setVlaData] = useState<VlaData | null>(null);
  const [isLoading, setIsLoading] = useState<boolean>(false);
  const [loadingMessage, setLoadingMessage] = useState<string>('');
  const [error, setError] = useState<string | null>(null);
  const [totalFrames, setTotalFrames] = useState<number>(0);
  const [highlightPoint, setHighlightPoint] = useState<HighlightPoint>(null);
  const [coordinatePicker, setCoordinatePicker] = useState<CoordinatePickerCallback>(null);
  const [usedFallback, setUsedFallback] = useState<boolean>(false);
  const videoRef = useRef<HTMLVideoElement>(null);
  // Re-entrancy guard: a ref (not state) so it is readable synchronously
  // inside handleGenerate without waiting for a re-render.
  const isGeneratingRef = useRef(false);

  /**
   * Accepts an uploaded file, rejects non-video MIME types, swaps the object
   * URL used by the player, resets all per-video state and reads the duration
   * from a detached <video> element.
   */
  const handleVideoUpload = useCallback((file: File) => {
    if (!file.type.startsWith('video/')) {
      setError('Please upload a valid video file.');
      return;
    }

    setVideoFile(file);
    // Release the previous object URL before creating a new one.
    if (videoSrc) {
      URL.revokeObjectURL(videoSrc);
    }
    const url = URL.createObjectURL(file);
    setVideoSrc(url);
    setVlaData(null);
    setError(null);
    setUsedFallback(false);
    setVideoDuration(0);
    setTotalFrames(0);

    // Only metadata is needed to learn the duration; the element is never
    // attached to the document.
    const videoElement = document.createElement('video');
    videoElement.preload = 'metadata';
    videoElement.src = url;
    videoElement.onloadedmetadata = () => {
      setVideoDuration(videoElement.duration);
    };
    videoElement.onerror = () => {
      setError("Could not read video metadata to get duration.");
    };
  }, [videoSrc]);

  /**
   * Runs the full generation pipeline for the current video. Does nothing if
   * a run is already in progress; reports an error if no video is selected.
   * Any failure clears partial results and surfaces the message via `error`.
   */
  const handleGenerate = useCallback(async () => {
    if (!videoFile || isGeneratingRef.current) {
      if (!videoFile) setError('No video file selected.');
      return;
    }

    isGeneratingRef.current = true;
    setIsLoading(true);
    setError(null);
    setVlaData(null);
    setUsedFallback(false); // Reset on each generation

    try {
      const FRAMES_PER_SECOND = 2; // Extract 2 frames per second
      const MAX_FRAMES_TOTAL = 360; // Cap at 360 frames (e.g., 3 minutes at 2fps) to manage memory/performance
      // Math.ceil of any positive duration is already >= 1, so no separate
      // "at least one frame" adjustment is required.
      const frameCount = Math.min(Math.ceil(videoDuration * FRAMES_PER_SECOND), MAX_FRAMES_TOTAL);
      setTotalFrames(frameCount);

      // Step 1: Extract frames
      setLoadingMessage(`Step 1/3: Extracting ${frameCount} frames from video...`);
      const frames = await extractFramesFromVideo(videoFile, frameCount);
      if (frames.length === 0) {
        throw new Error("Could not extract any frames from the video. The file might be corrupted or in an unsupported format.");
      }

      // Step 2: Generate Overall Goal from the first, middle and last frame
      setLoadingMessage('Step 2/3: Determining overall goal...');
      const keyframes = [frames[0], frames[Math.floor(frames.length / 2)], frames[frames.length - 1]];
      const overallGoal = await generateOverallGoal(keyframes, videoDuration);
      // Show the goal immediately while the tasks are still being generated.
      const initialVlaData: VlaData = { overallGoal, tasks: [] };
      setVlaData(initialVlaData);

      // Step 3: Generate Task Segments and Interactions in one go
      setLoadingMessage('Step 3/3: Analyzing tasks and interactions...');
      const generatedData = await generateTasksAndInteractions(
        frames,
        overallGoal,
        videoDuration,
        // Pass the locally computed count: the `totalFrames` state captured by
        // this closure still holds the value from before setTotalFrames above.
        frameCount,
        (current, total) => {
          // Progress callback - you could update loading message here
          console.log(`Progress: ${current}/${total}`);
        }
      );
      setVlaData(generatedData);
    } catch (err) {
      console.error(err);
      const errorMessage = err instanceof Error ? err.message : 'An unknown error occurred.';
      setError(`Failed to process video. ${errorMessage}`);
      setVlaData(null); // Clear partial data on major failure
    } finally {
      setIsLoading(false);
      setLoadingMessage('');
      isGeneratingRef.current = false;
    }
  }, [videoFile, videoDuration]);

  /**
   * Serializes the current VLA data as pretty-printed JSON and triggers a
   * browser download named after the video file (extension stripped).
   */
  const handleDownload = useCallback(() => {
    if (!vlaData || !videoFile) return;

    const dataStr = JSON.stringify(vlaData, null, 2);
    const dataBlob = new Blob([dataStr], { type: 'application/json' });
    const dataUrl = URL.createObjectURL(dataBlob);

    const link = document.createElement('a');
    link.href = dataUrl;
    // Falls back to the full name when stripping the extension yields ''.
    const baseName = videoFile.name.substring(0, videoFile.name.lastIndexOf('.')) || videoFile.name;
    link.download = `${baseName}_vla_data.json`;
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
    URL.revokeObjectURL(dataUrl);
  }, [vlaData, videoFile]);

  /** Moves the player's playhead to `time`, if the player is mounted. */
  const handleSeekToTime = useCallback((time: number) => {
    if (videoRef.current) {
      videoRef.current.currentTime = time;
    }
  }, []);

  /**
   * Replaces one interaction of the task with id `taskId`, copying the
   * affected task and its interactions array instead of mutating them.
   */
  const handleUpdateInteraction = useCallback((taskId: number, interactionIndex: number, updatedInteraction: Interaction) => {
    setVlaData(currentData => {
      if (!currentData) return null;
      const newTasks = currentData.tasks.map(task => {
        if (task.id === taskId) {
          const newInteractions = [...task.interactions];
          newInteractions[interactionIndex] = updatedInteraction;
          return { ...task, interactions: newInteractions };
        }
        return task;
      });
      return { ...currentData, tasks: newTasks };
    });
  }, []);

  /** Forwards a click on the video to the registered coordinate picker, if any. */
  const handleVideoClick = useCallback((coords: { x: number; y: number }) => {
    if (coordinatePicker) {
      coordinatePicker(coords);
    }
  }, [coordinatePicker]);

  /**
   * Updates the highlighted point, except that a non-editing point never
   * replaces a sticky editing highlight.
   */
  const handleHighlightPoint = useCallback((point: HighlightPoint) => {
    // Functional update: the decision is made against the latest state rather
    // than a value captured at render time, and the callback stays stable.
    setHighlightPoint(current => (current?.isEditing && !point?.isEditing ? current : point));
  }, []);

  /** Registers (or clears, with `null`) the coordinate picker callback. */
  const handleSetCoordinatePicker = useCallback((callback: CoordinatePickerCallback) => {
    // Wrapped in a function so React stores the callback itself instead of
    // treating it as a state updater and invoking it.
    setCoordinatePicker(() => callback);
  }, []);

  return (
    <div className="min-h-screen bg-slate-900 text-slate-200 font-sans">
      <main className="grid grid-cols-1 lg:grid-cols-2 gap-6 p-4 md:p-8 max-w-screen-2xl mx-auto">
        {/* Left Column: Video and Controls */}
        <div className="flex flex-col gap-6 lg:h-[calc(100vh-4rem)]">
          <header>
            <h1 className="text-3xl md:text-4xl font-bold text-white tracking-tight">VLA Data Generator</h1>
            <p className="text-slate-400 mt-2">Upload a screen recording to automatically generate structured data about user actions.</p>
          </header>
          <div
            className={`bg-slate-800/50 rounded-2xl p-2 aspect-video flex-grow flex items-center justify-center ${coordinatePicker ? 'cursor-crosshair' : ''}`}
          >
            {videoSrc ? (
              <VideoPlayer
                src={videoSrc}
                ref={videoRef}
                highlightPoint={highlightPoint}
                onVideoClick={handleVideoClick}
              />
            ) : (
              <VideoUploader onVideoSelect={handleVideoUpload} />
            )}
          </div>
          {videoFile && (
            <button
              onClick={handleGenerate}
              disabled={isLoading || videoDuration === 0}
              className="w-full flex items-center justify-center gap-3 bg-indigo-600 hover:bg-indigo-500 disabled:bg-indigo-800 disabled:text-slate-400 disabled:cursor-not-allowed text-white font-bold py-3 px-4 rounded-xl transition-all duration-300 text-lg shadow-lg shadow-indigo-900/50"
            >
              <WandSparkles className="w-6 h-6" />
              {isLoading ? 'Generating...' : (videoDuration === 0 ? 'Reading Video...' : 'Generate Action Data')}
            </button>
          )}
        </div>
        {/* Right Column: Results */}
        <div className="bg-slate-800 rounded-2xl lg:h-[calc(100vh-4rem)] flex flex-col">
          <ResultsDisplay
            vlaData={vlaData}
            isLoading={isLoading}
            loadingMessage={loadingMessage}
            error={error}
            hasVideo={!!videoFile}
            videoDuration={videoDuration}
            totalFrames={totalFrames}
            usedFallback={usedFallback}
            onDownload={handleDownload}
            onSeekToTime={handleSeekToTime}
            onUpdateInteraction={handleUpdateInteraction}
            onHighlightPoint={handleHighlightPoint}
            onSetCoordinatePicker={handleSetCoordinatePicker}
          />
        </div>
      </main>
    </div>
  );
}