cbensimon (HF Staff) committed
Commit f0b5714 · 1 Parent(s): c79e236
.gitignore ADDED
@@ -0,0 +1 @@
+ *.pyc
spaces/__init__.py ADDED
@@ -0,0 +1,30 @@
+ """
+ """
+
+ import sys
+
+
+ if sys.version_info.minor < 8: # pragma: no cover
+     raise RuntimeError("Importing PySpaces requires Python 3.8+")
+
+
+ # Prevent gradio from importing spaces
+ if (gr := sys.modules.get('gradio')) is not None: # pragma: no cover
+     try:
+         gr.Blocks
+     except AttributeError:
+         raise ImportError
+
+
+ from .zero.decorator import GPU
+ from .gradio import gradio_auto_wrap
+ from .gradio import disable_gradio_auto_wrap
+ from .gradio import enable_gradio_auto_wrap
+
+
+ __all__ = [
+     'GPU',
+     'gradio_auto_wrap',
+     'disable_gradio_auto_wrap',
+     'enable_gradio_auto_wrap',
+ ]
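The package entry point only re-exports the ZeroGPU decorator and the Gradio auto-wrap helpers, so Space code normally interacts with it through `spaces.GPU`. A minimal usage sketch (the model and function below are illustrative; off a ZeroGPU Space the decorator simply returns the function unchanged):

```
import spaces
import torch

model = torch.nn.Linear(8, 2)
model.to('cuda')  # on ZeroGPU, the actual move is deferred until a GPU is attached

@spaces.GPU  # or @spaces.GPU(duration=30) to adjust the estimated duration
def predict(x):
    # CUDA is only really available while this function runs
    return model(torch.tensor(x, device='cuda')).tolist()
```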
spaces/config.py ADDED
@@ -0,0 +1,53 @@
+ """
+ """
+ from __future__ import annotations
+
+ import os
+ from pathlib import Path
+
+
+ ZEROGPU_OFFLOAD_DIR_DEFAULT = str(Path.home() / '.zerogpu' / 'tensors')
+
+
+ def boolean(value: str | None) -> bool:
+     return value is not None and value.lower() in ("1", "t", "true")
+
+
+ class Settings:
+     def __init__(self):
+         self.zero_gpu = boolean(
+             os.getenv('SPACES_ZERO_GPU'))
+         self.zero_device_api_url = (
+             os.getenv('SPACES_ZERO_DEVICE_API_URL'))
+         self.gradio_auto_wrap = boolean(
+             os.getenv('SPACES_GRADIO_AUTO_WRAP'))
+         self.zero_patch_torch_device = boolean(
+             os.getenv('ZERO_GPU_PATCH_TORCH_DEVICE'))
+         self.zero_gpu_v2 = boolean(
+             os.getenv('ZEROGPU_V2'))
+         self.zerogpu_offload_dir = (
+             os.getenv('ZEROGPU_OFFLOAD_DIR', ZEROGPU_OFFLOAD_DIR_DEFAULT))
+         self.zerogpu_proc_self_cgroup_path = (
+             os.getenv('ZEROGPU_PROC_SELF_CGROUP_PATH', '/proc/self/cgroup'))
+         self.zerogpu_cuda_device_name = (
+             os.getenv('ZEROGPU_CUDA_DEVICE_NAME', "NVIDIA H200 MIG 3g.71gb"))
+         self.zerogpu_cuda_total_memory = int(
+             os.getenv('ZEROGPU_CUDA_TOTAL_MEMORY', 74625056768))
+         self.zerogpu_cuda_reserved_memory = int(
+             os.getenv('ZEROGPU_CUDA_RESERVED_MEMORY', 0))
+         self.zerogpu_cuda_capability_major = int(
+             os.getenv('ZEROGPU_CUDA_CAPABILITY_MAJOR', 9))
+         self.zerogpu_cuda_capability_minor = int(
+             os.getenv('ZEROGPU_CUDA_CAPABILITY_MINOR', 0))
+         self.zerogpu_cuda_multi_processor_count = int(
+             os.getenv('ZEROGPU_CUDA_MULTI_PROCESSOR_COUNT', 60))
+
+
+ Config = Settings()
+
+
+ if Config.zero_gpu:
+     assert Config.zero_device_api_url is not None, (
+         'SPACES_ZERO_DEVICE_API_URL env must be set '
+         'on ZeroGPU Spaces (identified by SPACES_ZERO_GPU=true)'
+     )
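`Settings` reads every value from environment variables exactly once, when `spaces` is first imported, so any override must be in place beforehand. A small sketch (the variable values and path are illustrative):

```
import os

# Must run before `import spaces`: Config = Settings() is evaluated at import time
os.environ['SPACES_GRADIO_AUTO_WRAP'] = 'true'
os.environ['ZEROGPU_OFFLOAD_DIR'] = '/data/zerogpu-tensors'  # illustrative path

import spaces
from spaces.config import Config

assert Config.gradio_auto_wrap is True
assert Config.zerogpu_offload_dir == '/data/zerogpu-tensors'
```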
spaces/gradio.py ADDED
@@ -0,0 +1,55 @@
+ """
+ """
+ from __future__ import annotations
+
+ from typing import Callable
+ from typing import Generator
+ from typing import TypeVar
+ from typing import overload
+ from typing_extensions import ParamSpec
+
+ from .config import Config
+ from .zero.decorator import GPU
+
+
+ Param = ParamSpec('Param')
+ Res = TypeVar('Res')
+
+
+ gradio_auto_wrap_enabled = Config.gradio_auto_wrap
+
+
+ def disable_gradio_auto_wrap():
+     global gradio_auto_wrap_enabled
+     gradio_auto_wrap_enabled = False
+
+ def enable_gradio_auto_wrap():
+     global gradio_auto_wrap_enabled
+     gradio_auto_wrap_enabled = True
+
+
+ @overload
+ def gradio_auto_wrap(
+     task:
+         Callable[Param, Res],
+ ) -> Callable[Param, Res]:
+     ...
+ @overload
+ def gradio_auto_wrap(
+     task:
+         None,
+ ) -> None:
+     ...
+ def gradio_auto_wrap(
+     task:
+         Callable[Param, Res]
+         | None,
+ ) -> (Callable[Param, Res]
+       | None):
+     """
+     """
+     if not gradio_auto_wrap_enabled:
+         return task
+     if not callable(task):
+         return task
+     return GPU(task) # type: ignore
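`gradio_auto_wrap` is a guarded pass-through: it wraps a callable with `GPU` only when the feature is enabled, and leaves `None` or non-callables untouched. A sketch of the toggle behaviour (off a ZeroGPU Space, `GPU` itself returns the task unchanged):

```
from spaces.gradio import (
    gradio_auto_wrap,
    enable_gradio_auto_wrap,
    disable_gradio_auto_wrap,
)

def fn(x):
    return x

enable_gradio_auto_wrap()
wrapped = gradio_auto_wrap(fn)         # equivalent to spaces.GPU(fn)
assert gradio_auto_wrap(None) is None  # non-callables pass through untouched

disable_gradio_auto_wrap()
assert gradio_auto_wrap(fn) is fn      # disabled: the task is returned as-is
```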
spaces/utils.py ADDED
@@ -0,0 +1,91 @@
+ """
+ """
+ from __future__ import annotations
+
+ import base64
+ import ctypes
+ import json
+ import sys
+ from functools import lru_cache as cache
+ from functools import partial
+ from typing import Any
+
+ import multiprocessing
+ from multiprocessing.queues import SimpleQueue as _SimpleQueue
+ from pathlib import Path
+ from pickle import PicklingError
+ from typing import Callable
+ from typing import TypeVar
+
+ from .config import Config
+
+
+ GRADIO_VERSION_ERROR_MESSAGE = "Make sure Gradio version is at least 3.46"
+
+
+ T = TypeVar('T')
+
+
+ @cache
+ def self_cgroup_device_path() -> str:
+     cgroup_content = Path(Config.zerogpu_proc_self_cgroup_path).read_text()
+     for line in cgroup_content.strip().split('\n'):
+         contents = line.split(':devices:')
+         if len(contents) != 2:
+             continue # pragma: no cover
+         return contents[1]
+     raise Exception # pragma: no cover
+
+
+ if sys.version_info.minor < 9: # pragma: no cover
+     _SimpleQueue.__class_getitem__ = classmethod(lambda cls, _: cls) # type: ignore
+
+ class SimpleQueue(_SimpleQueue[T]):
+     def __init__(self, *args):
+         super().__init__(*args, ctx=multiprocessing.get_context('fork'))
+     def put(self, obj: T):
+         try:
+             super().put(obj)
+         except PicklingError:
+             raise # pragma: no cover
+         # https://bugs.python.org/issue29187
+         except Exception as e:
+             message = str(e)
+             if not "pickle" in message:
+                 raise # pragma: no cover
+             raise PicklingError(message)
+     def close(self): # Python 3.8 static typing trick
+         super().close() # type: ignore
+     def wlock_release(self):
+         if (lock := getattr(self, '_wlock', None)) is None:
+             return # pragma: no cover
+         try:
+             lock.release()
+         except ValueError:
+             pass
+
+
+ def drop_params(fn: Callable[[], T]) -> Callable[..., T]:
+     def drop(*args):
+         return fn()
+     return drop
+
+
+ def gradio_request_var():
+     try:
+         from gradio.context import LocalContext
+     except ImportError: # pragma: no cover
+         raise RuntimeError(GRADIO_VERSION_ERROR_MESSAGE)
+     return LocalContext.request
+
+
+ def malloc_trim():
+     ctypes.CDLL("libc.so.6").malloc_trim(0)
+
+
+ debug = partial(print, 'SPACES_ZERO_GPU_DEBUG')
+
+
+ def jwt_payload(token: str) -> dict[str, Any]:
+     _, payload, _ = token.split('.')
+     return json.loads(base64.urlsafe_b64decode(f'{payload}=='))
spaces/zero/__init__.py ADDED
@@ -0,0 +1,21 @@
+ """
+ """
+
+ from pathlib import Path
+
+ from ..config import Config
+
+
+ if Config.zero_gpu:
+
+     from . import gradio
+     from . import torch
+
+     if torch.is_in_bad_fork():
+         raise RuntimeError(
+             "CUDA has been initialized before importing the `spaces` package"
+         )
+
+     torch.patch()
+     gradio.one_launch(torch.pack)
+     Path(Config.zerogpu_offload_dir).mkdir(parents=True, exist_ok=True)
spaces/zero/api.py ADDED
@@ -0,0 +1,159 @@
+ """
+ Synced with huggingface/pyspaces:spaces/zero/api.py
+ """
+ from __future__ import annotations
+
+ from datetime import timedelta
+ from typing import Any
+ from typing import Generator
+ from typing import Literal
+ from typing import NamedTuple
+ from typing import Optional
+ from typing import overload
+
+ import httpx
+ from pydantic import BaseModel
+ from typing_extensions import assert_never
+
+
+ AllowToken = str
+ NvidiaIndex = int # TODO: Migrate to GpuIndex (less confusing for MIG)
+ NvidiaUUID = str
+ CGroupPath = str
+ TaskId = int
+
+ AuthLevel = Literal['regular', 'pro']
+ QueuingReason = Literal['node', 'concurrency']
+
+
+ AUTHENTICATED_HEADER = 'X-Authenticated'
+ QUEUING_REASON_HEADER = 'X-Queuing-Reason'
+
+
+ class ScheduleResponse(BaseModel):
+     idle: bool
+     nvidiaIndex: int
+     nvidiaUUID: str
+     allowToken: str
+
+
+ class ScheduleMetadata(BaseModel):
+     auth: Optional[AuthLevel] = None
+     queuing_reason: Optional[QueuingReason] = None
+
+
+ class QuotaInfos(BaseModel):
+     left: int
+     wait: timedelta
+
+
+ class QueueEvent(BaseModel):
+     event: Literal['ping', 'failed', 'succeeded']
+     data: Optional[ScheduleResponse] = None
+
+
+ def sse_parse(text: str):
+     event, *data = text.strip().splitlines()
+     assert event.startswith('event:')
+     event = event[6:].strip()
+     if event in ('ping', 'failed'):
+         return QueueEvent(event=event)
+     assert event == 'succeeded'
+     (data,) = data
+     assert data.startswith('data:')
+     data = data[5:].strip()
+     return QueueEvent(event=event, data=ScheduleResponse.parse_raw(data))
+
+
+ def sse_stream(res: httpx.Response) -> Generator[QueueEvent, Any, None]:
+     for text in res.iter_text():
+         if len(text) == 0:
+             break # pragma: no cover
+         try:
+             yield sse_parse(text)
+         except GeneratorExit:
+             res.close()
+             break
+
+
+ class APIClient:
+
+     def __init__(self, client: httpx.Client):
+         self.client = client
+
+     def startup_report(self) -> httpx.codes:
+         res = self.client.post('/startup-report')
+         return httpx.codes(res.status_code)
+
+     def schedule(
+         self,
+         cgroup_path: str,
+         task_id: int = 0,
+         token: str | None = None,
+         token_version: int = 1,
+         duration_seconds: int | None = None,
+         enable_queue: bool = True,
+     ):
+         params: dict[str, str | int | bool] = {
+             'cgroupPath': cgroup_path,
+             'taskId': task_id,
+             'enableQueue': enable_queue,
+             'tokenVersion': token_version,
+         }
+         if duration_seconds is not None:
+             params['durationSeconds'] = duration_seconds
+         if token is not None:
+             params['token'] = token
+         res = self.client.send(
+             request=self.client.build_request(
+                 method='POST',
+                 url='/schedule',
+                 params=params,
+             ),
+             stream=True,
+         )
+         status = httpx.codes(res.status_code)
+         auth: AuthLevel | None = res.headers.get(AUTHENTICATED_HEADER)
+         queuing_reason: QueuingReason | None = res.headers.get(QUEUING_REASON_HEADER)
+         metadata = ScheduleMetadata(auth=auth, queuing_reason=queuing_reason)
+         if (status is not httpx.codes.OK and
+             status is not httpx.codes.TOO_MANY_REQUESTS
+         ):
+             res.close()
+             return status, metadata
+         if "text/event-stream" in res.headers['content-type']:
+             return sse_stream(res), metadata
+         res.read()
+         if status is httpx.codes.TOO_MANY_REQUESTS:
+             return QuotaInfos(**res.json()), metadata # pragma: no cover
+         if status is httpx.codes.OK:
+             return ScheduleResponse(**res.json()), metadata
+         assert_never(status)
+
+     def allow(
+         self,
+         allow_token: str,
+         pid: int,
+     ):
+         res = self.client.post('/allow', params={
+             'allowToken': allow_token,
+             'pid': pid,
+         })
+         return httpx.codes(res.status_code)
+
+     def release(
+         self,
+         allow_token: str,
+         fail: bool = False,
+     ) -> httpx.codes:
+         res = self.client.post('/release', params={
+             'allowToken': allow_token,
+             'fail': fail,
+         })
+         return httpx.codes(res.status_code)
+
+     def get_queue_size(self) -> float:
+         res = self.client.get('/queue-size')
+         assert res.status_code == 200, res.status_code
+         size = res.json()
+         return size
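The device API answers `/schedule` either with a plain JSON body or, when the request is queued, with a `text/event-stream` whose chunks `sse_parse` turns into `QueueEvent` objects. A sketch of the parsing step alone, assuming `httpx` and `pydantic` are installed and the JSON values are illustrative:

```
from spaces.zero.api import sse_parse

chunk = (
    "event: succeeded\n"
    'data: {"idle": false, "nvidiaIndex": 0, "nvidiaUUID": "GPU-0", "allowToken": "tok"}\n'
)
event = sse_parse(chunk)
assert event.event == 'succeeded'
assert event.data is not None and event.data.allowToken == 'tok'

# ping/failed events carry no data payload
assert sse_parse("event: ping\n").event == 'ping'
```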
spaces/zero/client.py ADDED
@@ -0,0 +1,274 @@
+ """
+ """
+ from __future__ import annotations
+
+ import os
+ import time
+ import warnings
+ from datetime import timedelta
+ from typing import Any
+
+ import gradio as gr
+ import httpx
+ from packaging import version
+ from typing_extensions import assert_never
+
+ from .. import utils
+ from ..config import Config
+ from .api import APIClient
+ from .api import AuthLevel
+ from .api import QuotaInfos
+ from .api import ScheduleResponse
+ from .gradio import info
+ from .gradio import error
+ from .gradio import get_event
+ from .gradio import supports_auth
+
+
+ TOKEN_HEADER = 'X-IP-Token'
+ DEFAULT_SCHEDULE_DURATION = 60
+
+ UNUSED_MESSAGE = "GPU device not used"
+ NO_GPU_MESSAGE_REGULAR = "No GPU was available"
+ NO_GPU_MESSAGE_INQUEUE = "No GPU was available after 60s"
+ EXAMPLES_RETRY_MESSAGE = "Try re-running outside of examples if it happened after clicking one"
+
+ SIGNUP_ON_HF_TXT = "Create a free account"
+ SIGNUP_ON_HF_URL = "https://huggingface.co/join"
+ SUBSCRIBE_TO_PRO_TXT = "Subscribe to Pro"
+ SUBSCRIBE_TO_PRO_URL = "https://huggingface.co/settings/billing/subscription"
+
+
+ def api_client():
+     assert Config.zero_device_api_url is not None
+     httpx_client = httpx.Client(base_url=Config.zero_device_api_url, timeout=60, verify=False)
+     return APIClient(httpx_client)
+
+
+ def startup_report():
+     retries, max_retries = 0, 2
+     client = api_client()
+     while (status := client.startup_report()) is httpx.codes.NOT_FOUND: # pragma: no cover
+         time.sleep(1)
+         if (retries := retries + 1) > max_retries:
+             raise RuntimeError("Error while initializing ZeroGPU: NotFound")
+     if status is not httpx.codes.OK: # pragma: no cover
+         raise RuntimeError("Error while initializing ZeroGPU: Unknown")
+
+
+ def html_string(html_contents: str, text_contents: str): # pragma: no cover
+     class HTMLString(str):
+         def __str__(self):
+             return text_contents
+     return HTMLString(html_contents)
+
+
+ def _toast_action(
+     auth: AuthLevel | None,
+     supports_html: bool,
+     pro_message: str,
+     unlogged_desc: str,
+     logged_desc: str,
+     ending: str,
+ ) -> tuple[str, str]: # pragma: no cover
+     if not supports_auth() or auth == 'pro':
+         return pro_message, pro_message
+     html = ""
+     link = SIGNUP_ON_HF_URL if auth is None else SUBSCRIBE_TO_PRO_URL
+     text = SIGNUP_ON_HF_TXT if auth is None else SUBSCRIBE_TO_PRO_TXT
+     desc = unlogged_desc if auth is None else logged_desc
+     desc += f" {ending}."
+     style = ";".join([
+         "white-space: nowrap",
+         "text-underline-offset: 2px",
+         "color: var(--body-text-color)",
+     ])
+     if supports_html:
+         html += f'<a style="{style}" href="{link}">'
+     html += text
+     if supports_html:
+         html += '</a>'
+     html += f" {desc}"
+     markdown = f'[{text}]({link}) {desc}'
+     return html, markdown
+
+
+ def schedule(
+     task_id: int,
+     request: gr.Request | None = None,
+     duration: timedelta | None = None,
+     _first_attempt: bool = True,
+ ) -> ScheduleResponse:
+
+     if not (gradio_version := version.parse(gr.__version__)).major >= 4: # pragma: no cover
+         raise RuntimeError("ZeroGPU is only compatible with Gradio 4+")
+
+     GRADIO_HTML_TOASTS = gradio_version >= version.Version('4.39')
+     GRADIO_HANDSHAKE = gradio_version >= version.Version('5.16.1')
+
+     token, payload = _get_token_and_payload(request)
+     if token is not None and (token_error := payload.get('error')):
+         message = f"Falling back to IP-based quotas ({token_error})"
+         info("ZeroGPU client warning", message, level='warning')
+
+     res, meta = api_client().schedule(
+         cgroup_path=utils.self_cgroup_device_path(),
+         task_id=task_id,
+         token=token,
+         token_version=2 if GRADIO_HANDSHAKE else 1,
+         duration_seconds=duration.seconds if duration is not None else None,
+     )
+
+     auth = meta.auth
+
+     if isinstance(res, ScheduleResponse):
+         return res
+
+     if isinstance(res, QuotaInfos): # pragma: no cover
+         requested = duration.seconds if duration is not None else DEFAULT_SCHEDULE_DURATION
+         if res.wait < timedelta(0):
+             message = (
+                 f"The requested GPU duration ({requested}s) "
+                 f"is larger than the maximum allowed"
+             )
+             raise error("ZeroGPU illegal duration", message)
+         elif token is None:
+             message = (
+                 f"Space app has reached its GPU limit. "
+                 f"{EXAMPLES_RETRY_MESSAGE}"
+             )
+             raise error("ZeroGPU quota exceeded", message)
+         else:
+             if payload.get('user') is None and res.wait == 0:
+                 message = "You have exceeded your runs limit."
+             else:
+                 gpu = "Pro GPU" if auth == 'pro' else ("free GPU" if auth == 'regular' else "GPU")
+                 message = (
+                     f"You have exceeded your {gpu} quota "
+                     f"({requested}s requested vs. {res.left}s left). "
+                     f"Try again in {res.wait}"
+                 )
+             raise error("ZeroGPU quota exceeded", message)
+
+     if not isinstance(res, httpx.codes): # pragma: no cover
+         if meta.queuing_reason in ('node', None):
+             info("ZeroGPU queue", "Waiting for a GPU to become available")
+         elif meta.queuing_reason == 'concurrency':
+             info("ZeroGPU queue", "Waiting for a GPU slot on this Space")
+         else:
+             assert_never(meta.queuing_reason)
+         # TODO: Sign-up message if not authenticated (after some time ?)
+         connection_event = get_event()
+         if connection_event is None and request is not None:
+             warnings.warn("ZeroGPU: Cannot get Gradio app Queue instance")
+         while True:
+             try:
+                 event = next(res)
+             except StopIteration:
+                 raise RuntimeError("Unexpected end of stream")
+             except httpx.RemoteProtocolError:
+                 if not _first_attempt:
+                     raise RuntimeError("Error while re-trying after queue disconnect")
+                 return schedule(task_id, request, duration, _first_attempt=False)
+             if event.event == 'ping':
+                 if connection_event is not None and not connection_event.alive:
+                     res.close()
+                     raise RuntimeError("Connection closed by visitor while queueing")
+                 continue
+             if event.event == 'failed':
+                 if token is None:
+                     message = f"{NO_GPU_MESSAGE_INQUEUE}. {EXAMPLES_RETRY_MESSAGE}"
+                     raise error("ZeroGPU quota exceeded", message)
+                 details_html, details_markdown = _toast_action(
+                     auth=auth,
+                     supports_html=GRADIO_HTML_TOASTS,
+                     pro_message="Retry later",
+                     unlogged_desc="to get a higher",
+                     logged_desc="to get the highest",
+                     ending="priority in ZeroGPU queues",
+                 )
+                 message_html = f"{NO_GPU_MESSAGE_INQUEUE}. {details_html}"
+                 message_text = f"{NO_GPU_MESSAGE_INQUEUE} {details_markdown}"
+                 message = html_string(message_html, message_text)
+                 raise error("ZeroGPU queue timeout", message, html=True)
+             if event.event == 'succeeded':
+                 assert event.data is not None
+                 if connection_event is not None and not connection_event.alive:
+                     release(event.data.allowToken)
+                     raise RuntimeError("Connection closed by visitor on queue success")
+                 info("ZeroGPU queue", "Successfully acquired a GPU", level='success')
+                 return event.data
+
+     if res is httpx.codes.SERVICE_UNAVAILABLE:
+         raise error("ZeroGPU client error", NO_GPU_MESSAGE_REGULAR)
+
+     if res is httpx.codes.UNAUTHORIZED: # pragma: no cover
+         raise error("ZeroGPU client error", "Expired ZeroGPU proxy token")
+
+     # TODO: Find a way to log 'detail' response field
+     raise RuntimeError(f"ZeroGPU API /schedule error: {res} ({httpx.codes.get_reason_phrase(res)})") # pragma: no cover
+
+
+ def allow(allow_token: str) -> None:
+     pid = os.getpid()
+     assert pid != 1, "Allowing PID 1 on ZeroGPU will end up killing your Space"
+     assert api_client().allow(allow_token=allow_token, pid=pid) is httpx.codes.OK
+
+
+ def release(
+     allow_token: str, *,
+     fail: bool = False,
+     allow_404: bool = False,
+ ) -> None:
+
+     res = api_client().release(
+         allow_token=allow_token,
+         fail=fail,
+     )
+
+     if res is httpx.codes.NO_CONTENT: # pragma: no cover
+         try:
+             info("ZeroGPU client warning", UNUSED_MESSAGE, level='warning')
+         except AttributeError:
+             pass
+         warnings.warn(UNUSED_MESSAGE, RuntimeWarning)
+         return None
+
+     if res is httpx.codes.NOT_FOUND:
+         if not allow_404:
+             warnings.warn("ZeroGPU API /release warning: 404 Not Found")
+         return None
+
+     if httpx.codes.is_success(res):
+         return None
+
+     # TODO: Find a way to log 'detail' response field
+     # TODO: Only raise in dev environment. Simply warn in production ?
+     raise RuntimeError(f"ZeroGPU API /release error: {res} ({httpx.codes.get_reason_phrase(res)})") # pragma: no cover
+
+
+ def _get_token(request: gr.Request | None) -> str | None:
+
+     if request is None:
+         return None
+
+     headers = getattr(request, 'headers', None)
+     if headers is None or not hasattr(headers, '__dict__'):
+         raise error("ZeroGPU client error", "Internal Gradio error")
+
+     # Compatibility trick
+     if not hasattr(headers, 'get'):
+         headers = headers.__dict__ # pragma: no cover
+
+     return headers.get(TOKEN_HEADER.lower())
+
+
+ def _get_token_and_payload(request: gr.Request | None) -> tuple[str | None, dict[str, Any]]:
+     if (token := _get_token(request)) is None:
+         return None, {}
+     try:
+         payload = utils.jwt_payload(token)
+     except Exception: # pragma: no cover
+         warnings.warn("Error while decoding X-IP-Token JWT")
+         return token, {}
+     return token, payload
spaces/zero/decorator.py ADDED
@@ -0,0 +1,113 @@
+ """
+ """
+ from __future__ import annotations
+
+ import inspect
+ import sys
+ import warnings
+ from datetime import timedelta
+ from functools import partial
+ from typing import Callable
+ from typing import TypeVar
+ from typing import overload
+ from typing_extensions import ParamSpec
+ from typing_extensions import Unpack
+
+ from ..config import Config
+ from .types import DynamicDuration
+ from .types import EmptyKwargs
+
+
+ P = ParamSpec('P')
+ R = TypeVar('R')
+
+
+ decorated_cache: dict[Callable, Callable] = {}
+
+
+ @overload
+ def GPU(
+     task: None = None, *,
+     duration: DynamicDuration[P] = None,
+ ) -> Callable[[Callable[P, R]], Callable[P, R]]:
+     ...
+ @overload
+ def GPU(
+     task: Callable[P, R], *,
+     duration: DynamicDuration[P] = None,
+ ) -> Callable[P, R]:
+     ...
+ def GPU(
+     task: Callable[P, R] | None = None, *,
+     duration: DynamicDuration[P] = None,
+     **kwargs: Unpack[EmptyKwargs],
+ ) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]:
+     """
+     ZeroGPU decorator
+
+     Basic usage:
+     ```
+     @spaces.GPU
+     def fn(...):
+         # CUDA is available here
+         pass
+     ```
+
+     With custom duration:
+     ```
+     @spaces.GPU(duration=45) # Expressed in seconds
+     def fn(...):
+         # CUDA is available here
+         pass
+     ```
+
+     Args:
+         task (`Callable | None`): Python function that requires CUDA
+         duration (`int | datetime.timedelta`): Estimated duration in seconds or `datetime.timedelta`
+
+     Returns:
+         `Callable`: GPU-ready function
+     """
+     if "enable_queue" in kwargs:
+         warnings.warn("`enable_queue` parameter is now ignored and always set to `True`")
+     if task is None:
+         return partial(_GPU, duration=duration)
+     return _GPU(task, duration)
+
+
+ def _GPU(
+     task: Callable[P, R],
+     duration: DynamicDuration[P],
+ ) -> Callable[P, R]:
+
+     if not Config.zero_gpu:
+         return task
+
+     from . import client
+     from .wrappers import regular_function_wrapper
+     from .wrappers import generator_function_wrapper
+
+     if sys.version_info.minor < 9: # pragma: no cover
+         raise RuntimeError("Actually using @spaces.GPU on a ZeroGPU Space requires Python 3.9+")
+
+     if task in decorated_cache:
+         # TODO: Assert same duration ?
+         return decorated_cache[task] # type: ignore
+
+     if inspect.iscoroutinefunction(task):
+         raise NotImplementedError
+
+     if inspect.isgeneratorfunction(task):
+         decorated = generator_function_wrapper(task, duration)
+     else:
+         decorated = regular_function_wrapper(task, duration)
+
+     setattr(decorated, 'zerogpu', None)
+
+     client.startup_report()
+     decorated_cache.update({
+         task: decorated,
+         decorated: decorated,
+     })
+
+     return decorated # type: ignore
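Besides a fixed number of seconds or a `timedelta`, the `DynamicDuration[P]` annotation suggests that `duration` can also be a callable receiving the same arguments as the task and returning the estimate at call time. A hedged sketch of that usage (the heuristic itself is illustrative):

```
import spaces

def get_duration(prompt: str, steps: int = 20):
    # illustrative heuristic: budget roughly one second per step
    return 10 + steps

@spaces.GPU(duration=get_duration)
def generate(prompt: str, steps: int = 20):
    ...
```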
spaces/zero/gradio.py ADDED
@@ -0,0 +1,190 @@
+ """
+ """
+ from __future__ import annotations
+
+ import inspect
+ from functools import wraps
+ from packaging import version
+ from typing import Any
+ from typing import Callable
+ from typing import Literal
+ from typing import NamedTuple
+ from typing import TYPE_CHECKING
+ import warnings
+
+ import gradio as gr
+ from gradio.context import Context
+ from gradio.context import LocalContext
+ from gradio.helpers import Progress
+ from gradio.helpers import TrackedIterable
+ from gradio.queueing import Queue
+ from typing_extensions import ParamSpec
+ from typing_extensions import TypeAlias
+
+ from ..utils import SimpleQueue
+ from .types import GeneratorResQueueResult
+ from .types import GradioQueueEvent
+ from .types import RegularResQueueResult
+
+
+ QUEUE_RPC_METHODS = [
+     "set_progress",
+     "log_message",
+ ]
+
+
+ try:
+     Success = gr.Success # pyright: ignore[reportAttributeAccessIssue] (Gradio<5.10)
+ except AttributeError: # pragma: no cover
+     Success = gr.Info
+
+ Level: TypeAlias = "Literal['success', 'info', 'warning']"
+
+ def modal(level: Level):
+     if level == 'info':
+         return gr.Info
+     if level == 'success':
+         return Success
+     if level == 'warning':
+         return gr.Warning
+
+
+ class GradioPartialContext(NamedTuple):
+     event_id: str | None
+     in_event_listener: bool
+     progress: Progress | None
+
+     @staticmethod
+     def get():
+         TrackedIterable.__reduce__ = tracked_iterable__reduce__
+         return GradioPartialContext(
+             event_id=LocalContext.event_id.get(),
+             in_event_listener=LocalContext.in_event_listener.get(),
+             progress=LocalContext.progress.get(),
+         )
+
+     @staticmethod
+     def apply(context: 'GradioPartialContext'):
+         LocalContext.event_id.set(context.event_id)
+         LocalContext.in_event_listener.set(context.in_event_listener)
+         LocalContext.progress.set(context.progress)
+
+
+ def get_queue_instance():
+     blocks = LocalContext.blocks.get()
+     if blocks is None: # pragma: no cover
+         return None
+     return blocks._queue
+
+
+ def get_event():
+     queue = get_queue_instance()
+     event_id = LocalContext.event_id.get()
+     if queue is None:
+         return None
+     if event_id is None: # pragma: no cover
+         return None
+     for job in queue.active_jobs:
+         if job is None: # pragma: no cover
+             continue
+         for event in job:
+             if event._id == event_id:
+                 return event
+
+
+ def get_server_port() -> int | None:
+     from_request_context = True
+     if (blocks := LocalContext.blocks.get()) is None: # Request
+         from_request_context = False
+         if (blocks := Context.root_block) is None: # Caching
+             return None
+     if (server := getattr(blocks, 'server', None)) is None: # pragma: no cover (Gradio 4)
+         if from_request_context:
+             warnings.warn("Gradio: No blocks.server inside a request") # pragma: no cover
+         return -1
+     if TYPE_CHECKING:
+         assert (server := blocks.server)
+     return server.config.port
+
+
+ def try_process_queue_event(method_name: str, *args, **kwargs):
+     queue = get_queue_instance()
+     if queue is None: # pragma: no cover
+         warnings.warn("ZeroGPU: Cannot get Gradio app Queue instance")
+         return
+     method = getattr(queue, method_name, None)
+     assert callable(method)
+     method(*args, **kwargs)
+
+
+ def patch_gradio_queue(
+     res_queue: SimpleQueue[RegularResQueueResult | None] | SimpleQueue[GeneratorResQueueResult | None],
+ ):
+
+     def rpc_method(method_name: str):
+         def method(*args, **kwargs):
+             if args and isinstance(args[0], Queue):
+                 args = args[1:] # drop `self`
+             res_queue.put(GradioQueueEvent(method_name, args, kwargs))
+         return method
+
+     for method_name in QUEUE_RPC_METHODS:
+         if (method := getattr(Queue, method_name, None)) is None: # pragma: no cover
+             warnings.warn(f"ZeroGPU: Gradio Queue has no {method_name} attribute")
+             continue
+         if not callable(method): # pragma: no cover
+             warnings.warn(f"ZeroGPU: Gradio Queue {method_name} is not callable")
+             continue
+         setattr(Queue, method_name, rpc_method(method_name))
+
+     TrackedIterable.__reduce__ = tracked_iterable__reduce__
+
+
+ def tracked_iterable__reduce__(self):
+     res: tuple = super(TrackedIterable, self).__reduce__() # type: ignore
+     cls, base, state, *_ = res
+     return cls, base, {**state, **{
+         'iterable': None,
+         '_tqdm': None,
+     }}
+
+
+ def supports_auth():
+     return version.parse(gr.__version__) >= version.Version('4.27.0')
+
+
+ Param = ParamSpec('Param')
+
+ def one_launch(task: Callable[Param, None], *task_args: Param.args, **task_kwargs: Param.kwargs):
+     _launch = gr.Blocks.launch
+     @wraps(gr.Blocks.launch)
+     def launch(*args, **kwargs):
+         task(*task_args, **task_kwargs)
+         gr.Blocks.launch = _launch
+         return gr.Blocks.launch(*args, **kwargs)
+     gr.Blocks.launch = launch
+
+
+ class HTMLError(gr.Error):
+     def __str__(self): # pragma: no cover
+         return self.message
+
+
+ def error(title: str, message: str, html: bool = False):
+     params = inspect.signature(gr.Error).parameters
+     kwargs: dict[str, Any] = {}
+     if 'title' in params:
+         kwargs = {**kwargs, 'title': title}
+     if 'print_exception' in params:
+         kwargs = {**kwargs, 'print_exception': False}
+     error_cls = HTMLError if html else gr.Error
+     return error_cls(message, **kwargs)
+
+
+ def info(title: str, message: str, level: Level = 'info'):
+     params = inspect.signature(gr.Info).parameters
+     kwargs: dict[str, Any] = {}
+     if 'title' in params:
+         kwargs = {**kwargs, 'title': title}
+     info_cls = modal(level)
+     return info_cls(message, **kwargs)
spaces/zero/torch/__init__.py ADDED
@@ -0,0 +1,42 @@
+ """
+ """
+
+ from ...config import Config
+
+
+ try:
+
+     import torch
+
+ except ImportError:
+
+     _patch = lambda *args, **kwargs: None
+     _unpatch = lambda *args, **kwargs: None
+     _pack = lambda *args, **kwargs: None
+     _init = lambda *args, **kwargs: None
+     _size = lambda *args, **kwargs: 0
+     _move = lambda *args, **kwargs: None
+     _is_in_bad_fork = lambda *args, **kwargs: False
+
+ else:
+
+     if Config.zero_gpu_v2:
+         from . import patching as _patching
+     else: # pragma: no cover
+         from . import patching_legacy as _patching
+
+     _patch = _patching.patch
+     _unpatch = _patching.unpatch
+     _pack = _patching.pack
+     _init = _patching.init
+     _size = _patching.size
+     _move = _patching.move
+     _is_in_bad_fork = _patching.is_in_bad_fork
+
+ patch = _patch
+ unpatch = _unpatch
+ pack = _pack
+ init = _init
+ size = _size
+ move = _move
+ is_in_bad_fork = _is_in_bad_fork
spaces/zero/torch/bitsandbytes.py ADDED
@@ -0,0 +1,170 @@
+ """
+ """
+ # pyright: reportPrivateImportUsage=false
+
+ from __future__ import annotations
+
+ import importlib
+ from contextlib import contextmanager
+ from contextlib import nullcontext
+ from importlib import metadata
+ from types import ModuleType
+ from typing import TYPE_CHECKING
+ from typing import Tuple
+
+ import torch
+ from packaging import version
+
+ if TYPE_CHECKING:
+     import torch as Torch
+
+
+ @contextmanager
+ def cuda_unavailable(torch: ModuleType): # pragma: no cover
+     _is_available = torch.cuda.is_available
+     torch.cuda.is_available = lambda: False
+     yield
+     torch.cuda.is_available = _is_available
+
+
+ def maybe_import_bitsandbytes():
+     try:
+         import torch
+     except ImportError: # pragma: no cover
+         return None
+     try:
+         bnb_version = version.parse(metadata.version('bitsandbytes'))
+     except ImportError: # pragma: no cover
+         return None
+     if bnb_version < version.parse('0.40.0'): # pragma: no cover
+         raise RuntimeError(f"ZeroGPU requires bitsandbytes >= 0.40.0 (installed: {bnb_version})")
+     if bnb_version < version.parse('0.43.1'): # pragma: no cover
+         context = lambda: cuda_unavailable(torch)
+     else:
+         context = lambda: nullcontext()
+     with (ctx := context()):
+         try:
+             import bitsandbytes
+         except ImportError:
+             return None
+     if not isinstance(ctx, nullcontext): # pragma: no cover
+         print("↑ Those bitsandbytes warnings are expected on ZeroGPU ↑")
+     return context
+
+
+ if (import_context := maybe_import_bitsandbytes()):
+
+     from torch.utils.weak import WeakTensorKeyDictionary
+
+     with (import_ctx := import_context()):
+         CUDASetup = None
+         if not isinstance(import_ctx, nullcontext): # pragma: no cover
+             from bitsandbytes.cuda_setup.main import CUDASetup # pyright: ignore [reportMissingImports]
+         from bitsandbytes import cextension
+         from bitsandbytes import functional
+         from bitsandbytes.nn import Int8Params
+         from bitsandbytes.nn import Params4bit
+
+     _param_to_8bit = Int8Params.to # type: ignore
+     _param_cuda_8bit = Int8Params.cuda
+     _param_to_4bit = Params4bit.to # type: ignore
+     _param_cuda_4bit = Params4bit.cuda
+
+     TensorToArgs = Tuple[torch.device, torch.dtype, bool, torch.memory_format]
+
+     to_ops_8bit: dict[Int8Params, TensorToArgs | None] = WeakTensorKeyDictionary() # type: ignore
+     to_ops_4bit: dict[Params4bit, TensorToArgs | None] = WeakTensorKeyDictionary() # type: ignore
+
+     def _to_op_register_8bit(self: Int8Params, *args, **kwargs):
+         parsed = torch._C._nn._parse_to(*args, **kwargs)
+         device, *_ = parsed
+         if not isinstance(device, torch.device): # pragma: no cover
+             return _param_to_8bit(self, *args, **kwargs)
+         if device.type != 'cuda':
+             return _param_to_8bit(self, *args, **kwargs)
+         to_ops_8bit[self] = parsed
+         return self
+
+     def _to_op_register_4bit(self: Params4bit, *args, **kwargs):
+         parsed = torch._C._nn._parse_to(*args, **kwargs)
+         device, *_ = parsed
+         if not isinstance(device, torch.device): # pragma: no cover
+             return _param_to_4bit(self, *args, **kwargs)
+         if device.type != 'cuda':
+             return _param_to_4bit(self, *args, **kwargs)
+         to_ops_4bit[self] = parsed
+         return self
+
+     def _cuda_op_arg_check(device: Torch.device | int | str | None) -> bool:
+         if device is None: # pragma: no cover
+             return True
+         if isinstance(device, int):
+             return True
+         if isinstance(device, str): # pragma: no cover
+             device = torch.device(device)
+         return device.type == 'cuda' # pragma: no cover
+
+     def _cuda_op_register_8bit(self: Int8Params, device: Torch.device | int | str | None = None, **kwargs):
+         if not _cuda_op_arg_check(device): # pragma: no cover
+             # Let PyTorch handle the fail
+             return _param_cuda_8bit(self, device, **kwargs)
+         to_ops_8bit[self] = None
+         return self
+
+     def _cuda_op_register_4bit(self: Params4bit, device: Torch.device | int | str | None = None, **kwargs):
+         if not _cuda_op_arg_check(device): # pragma: no cover
+             # Let PyTorch handle the fail
+             return _param_cuda_4bit(self, device, **kwargs)
+         to_ops_4bit[self] = None
+         return self
+
+     def _patch():
+         Int8Params.to = _to_op_register_8bit # type: ignore
+         Int8Params.cuda = _cuda_op_register_8bit # type: ignore
+         Params4bit.to = _to_op_register_4bit # type: ignore
+         Params4bit.cuda = _cuda_op_register_4bit # type: ignore
+
+     def _unpatch():
+         Int8Params.to = _param_to_8bit # type: ignore
+         Int8Params.cuda = _param_cuda_8bit
+         Params4bit.to = _param_to_4bit # type: ignore
+         Params4bit.cuda = _param_cuda_4bit
+
+     def _move():
+         if CUDASetup is not None: # pragma: no cover
+             CUDASetup._instance = None
+             importlib.reload(cextension)
+             functional.lib = cextension.lib
+         for op in to_ops_8bit.items():
+             tensor, parsed_args = op
+             if parsed_args:
+                 _, dtype, _, memory_format = parsed_args
+             else:
+                 dtype, memory_format = None, None
+             tensor.data = _param_to_8bit(tensor,
+                 device='cuda',
+                 dtype=dtype,
+                 memory_format=memory_format,
+             ) # type: ignore
+         for op in to_ops_4bit.items():
+             tensor, parsed_args = op
+             if parsed_args:
+                 _, dtype, _, memory_format = parsed_args
+             else:
+                 dtype, memory_format = None, None
+             tensor.data = _param_to_4bit(tensor,
+                 device='cuda',
+                 dtype=dtype,
+                 memory_format=memory_format,
+             ) # type: ignore
+
+ else:
+
+     _patch = lambda: None
+     _unpatch = lambda: None
+     _move = lambda: None
+
+
+ patch = _patch
+ unpatch = _unpatch
+ move = _move
spaces/zero/torch/cudart.py ADDED
@@ -0,0 +1,8 @@
+ """
+ """
+
+ from .static import CUDA_MEM_GET_INFO
+
+
+ def cudaMemGetInfo(device: int, /):
+     return CUDA_MEM_GET_INFO
spaces/zero/torch/packing.py ADDED
@@ -0,0 +1,209 @@
+ """
+ """
+ from __future__ import annotations
+
+ import time
+
+ import ctypes
+ import os
+ from concurrent.futures import as_completed
+ from concurrent.futures import ThreadPoolExecutor
+ from contextvars import copy_context
+ from dataclasses import dataclass
+ from queue import Queue
+ from typing import Callable
+
+ from ...utils import debug
+
+ import torch
+ from typing_extensions import TypeAlias
+
+
+ PAGE_SIZE = 4096
+ TOTAL_MEMORY = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
+ VM_MAX_SIZE = min(2**38, TOTAL_MEMORY // 2)
+
+ BUFFER_SIZE = 64 * 2**20
+ BUFFER_COUNT = 2
+
+
+ TensorWithSizes: TypeAlias = 'tuple[torch.Tensor, int, int]'
+
+ @dataclass
+ class ZeroGPUTensorPack:
+     base_dir: str
+     batches: list[list[TensorWithSizes]]
+     big_tensors: list[TensorWithSizes]
+     fakes: dict[torch.Tensor, list[torch.Tensor]]
+     total_size: int
+     def path(self):
+         return f'{self.base_dir}/{id(self)}'
+     def __del__(self):
+         try:
+             os.remove(self.path())
+         except FileNotFoundError: # pragma: no cover
+             pass
+
+
+ def write(fd: int, tensor: torch.Tensor):
+     clone = torch.empty_like(tensor)
+     size = clone.untyped_storage().size() # pyright: ignore [reportAttributeAccessIssue]
+     buffer = torch.UntypedStorage(VM_MAX_SIZE)
+     buffer_ptr = buffer.data_ptr()
+     offset = -buffer_ptr % PAGE_SIZE
+     padding = -size % PAGE_SIZE
+     clone.set_(buffer[offset:offset+size], 0, clone.shape, clone.stride()) # pyright: ignore [reportArgumentType]
+     clone.copy_(tensor)
+     mv = memoryview((ctypes.c_char * (size+padding)).from_address(buffer_ptr+offset))
+     written_bytes = 0
+     while written_bytes < size:
+         written_bytes += os.write(fd, mv[written_bytes:])
+
+
+ def pack_tensors(
+     tensors: set[torch.Tensor],
+     fakes: dict[torch.Tensor, list[torch.Tensor]],
+     offload_dir: str,
+     callback: Callable[[int]] | None = None,
+ ):
+
+     callback = (lambda bytes: None) if callback is None else callback
+
+     batches: list[list[TensorWithSizes]] = []
+     big_tensors: list[TensorWithSizes] = []
+
+     tensors_with_sizes: list[tuple[torch.Tensor, int, int]] = []
+     for tensor in tensors:
+         size = tensor.numel() * tensor.element_size()
+         aligned_size = size + (-size % PAGE_SIZE)
+         tensors_with_sizes += [(tensor, size, aligned_size)]
+
+     current_batch, current_size = [], 0
+     for (tensor, size, aligned_size) in sorted(tensors_with_sizes, key=lambda item: item[2]):
+         if aligned_size > BUFFER_SIZE:
+             big_tensors += [(tensor, size, aligned_size)]
+             continue
+         current_size += aligned_size
+         if current_size > BUFFER_SIZE:
+             batches += [current_batch]
+             current_batch, current_size = [(tensor, size, aligned_size)], aligned_size
+         else:
+             current_batch += [(tensor, size, aligned_size)]
+
+     if current_batch:
+         batches += [current_batch]
+
+     get_meta = {tensor: torch.empty_like(tensor) for tensor in tensors}
+     batches_meta = [[(get_meta[tensor], size, asize) for tensor, size, asize in batch] for batch in batches]
+     big_tensors_meta = [(get_meta[tensor], size, asize) for tensor, size, asize in big_tensors]
+     fakes_meta = {get_meta[tensor]: fake_list for tensor, fake_list in fakes.items()}
+
+     pack = ZeroGPUTensorPack(
+         base_dir=offload_dir,
+         batches=batches_meta,
+         big_tensors=big_tensors_meta,
+         fakes=fakes_meta,
+         total_size=sum([size for _, size, _ in tensors_with_sizes]),
+     )
+
+     fd = os.open(pack.path(), os.O_CREAT | os.O_WRONLY | os.O_DIRECT)
+     try:
+         total_asize = sum([aligned_size for batch in batches for *_, aligned_size in batch])
+         total_asize += sum([aligned_size for *_, aligned_size in big_tensors])
+         if total_asize > 0:
+             os.posix_fallocate(fd, 0, total_asize)
+         for batch in batches:
+             for tensor, size, _ in batch:
+                 write(fd, tensor)
+                 callback(size)
+         for tensor, size, _ in big_tensors:
+             write(fd, tensor)
+             callback(size)
+         return pack
+     finally:
+         os.close(fd)
+
+
+ def pack_to_cuda(pack: ZeroGPUTensorPack, callback: Callable[[int]] | None = None):
+
+     callback = (lambda bytes: None) if callback is None else callback
+
+     free_buffers: Queue[torch.Tensor] = Queue()
+     read_buffers: Queue[torch.Tensor] = Queue()
+
+     for _ in range(BUFFER_COUNT):
+         free_buffers.put(torch.ByteTensor(BUFFER_SIZE).pin_memory())
+
+     def read(fd: int, buffer: torch.Tensor, size: int):
+         mv = memoryview((ctypes.c_char * size).from_address(buffer.data_ptr()))
+         read_bytes = 0
+         while read_bytes < size:
+             read_bytes += os.readv(fd, [mv[read_bytes:]])
+
+     def disk_to_pin(fd: int):
+         for batch in pack.batches:
+             buffer = free_buffers.get()
+             batch_size = sum([aligned_size for *_, aligned_size in batch])
+             read(fd, buffer, batch_size)
+             read_buffers.put(buffer)
+         for *_, aligned_size in pack.big_tensors:
+             read_bytes = 0
+             while read_bytes < aligned_size:
+                 buffer = free_buffers.get()
+                 read_size = min(BUFFER_SIZE, aligned_size - read_bytes)
+                 read(fd, buffer, read_size)
+                 read_buffers.put(buffer)
+                 read_bytes += read_size
+
+     def pin_to_cuda():
+         total_duration_in_callback = 0
+         for batch in pack.batches:
+             buffer = read_buffers.get()
+             offset = 0
+             cuda_storages = []
+             for tensor, size, aligned_size in batch:
+                 cuda_storages += [buffer[offset:offset+size].cuda(non_blocking=True)]
+                 offset += aligned_size
+             torch.cuda.synchronize()
+             free_buffers.put(buffer)
+             batch_total_size = 0
+             for (tensor, size, _), cuda_storage in zip(batch, cuda_storages):
+                 cuda_tensor = torch.tensor([], dtype=tensor.dtype, device='cuda')
+                 cuda_tensor = cuda_tensor.set_(cuda_storage.untyped_storage(), 0, tensor.shape, tensor.stride())
+                 for fake in pack.fakes[tensor]:
+                     fake.data = cuda_tensor
+                 batch_total_size += size
+             t0 = time.perf_counter()
+             callback(batch_total_size)
+             total_duration_in_callback += time.perf_counter() - t0
+         for tensor, size, _ in pack.big_tensors:
+             cuda_storage = torch.empty(size, dtype=torch.uint8, device='cuda')
+             offset = 0
+             while offset < size:
+                 buffer = read_buffers.get()
+                 read_size = min(BUFFER_SIZE, size - offset)
+                 cuda_storage[offset:offset+read_size] = buffer[:read_size]
+                 offset += read_size
+                 torch.cuda.synchronize() # Probably not needed
+                 free_buffers.put(buffer)
+                 t0 = time.perf_counter()
+                 callback(read_size)
+                 total_duration_in_callback += time.perf_counter() - t0
+             cuda_tensor = torch.tensor([], dtype=tensor.dtype, device='cuda')
+             cuda_tensor = cuda_tensor.set_(cuda_storage.untyped_storage(), 0, tensor.shape, tensor.stride())
+             for fake in pack.fakes[tensor]:
+                 fake.data = cuda_tensor
+
+         debug(f"{total_duration_in_callback=}")
+
+     with ThreadPoolExecutor(2) as e:
+         fd = os.open(pack.path(), os.O_RDONLY | os.O_DIRECT)
+         try:
+             futures = [
+                 e.submit(copy_context().run, disk_to_pin, fd),
+                 e.submit(copy_context().run, pin_to_cuda),
+             ]
+             for future in as_completed(futures):
+                 future.result()
+         finally:
+             os.close(fd)
spaces/zero/torch/patching.py ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ """
3
+ # pyright: reportPrivateImportUsage=false
4
+
5
+ from __future__ import annotations
6
+
7
+ import gc
8
+ import multiprocessing
9
+ import os
10
+ from collections import defaultdict
11
+ from concurrent.futures import ProcessPoolExecutor
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ from contextlib import nullcontext
14
+ from contextvars import copy_context
15
+ from typing import Any
16
+ from typing import Callable
17
+
18
+ import torch
19
+ from torch.overrides import TorchFunctionMode
20
+ from torch.overrides import resolve_name
21
+ from torch.utils._python_dispatch import TorchDispatchMode
22
+ from torch.utils._pytree import tree_map_only
23
+ from torch.utils.weak import WeakTensorKeyDictionary
24
+
25
+ from ...config import Config
26
+ from ...utils import malloc_trim
27
+ from ..tqdm import tqdm
28
+ from . import cudart
29
+ from .packing import ZeroGPUTensorPack
30
+ from .packing import pack_tensors
31
+ from .packing import pack_to_cuda
32
+ from .static import *
33
+ from .types import AliasId
34
+
35
+
36
+ OPS_INPUTS_CHECK_NO_RETURN = (
37
+ torch.Tensor.equal,
38
+ )
39
+
40
+ OPS_INPUT_CHECK_SELF_RETURN = (
41
+ torch.Tensor.set_, # probably never dispatched
42
+ torch.ops.aten.set_.source_Tensor, # pyright: ignore [reportAttributeAccessIssue]
43
+ )
44
+
45
+ OFFLOADED_ERROR_MESSAGE = "Cannot apply function {} on disk-offloaded Tensor {}"
46
+
47
+ _tensor_make_subclass = torch.Tensor._make_subclass
48
+ _asarray = torch.asarray
49
+ _device = torch.device
50
+ _cuda_init = torch._C._cuda_init
51
+ _cuda_exchange_device = torch.cuda._exchange_device
52
+ _cuda_available = torch.cuda.is_available
53
+ _cuda_device_count = torch.cuda.device_count
54
+ _cuda_current_device = torch.cuda.current_device
55
+ _cuda_get_device_capability = torch.cuda.get_device_capability
56
+ _cuda_get_device_properties = torch.cuda.get_device_properties
57
+ _cuda_get_device_name = torch.cuda.get_device_name
58
+ _cuda_memory_stats_as_nested_dict = torch.cuda.memory.memory_stats_as_nested_dict
59
+ _cuda_cudart = torch.cuda.cudart
60
+
61
+ # PyTorch 2.3
62
+ _cuda_maybe_exchange_device = getattr(torch.cuda, '_maybe_exchange_device', None)
63
+
64
+
65
+ cuda_aliases: dict[torch.Tensor, torch.Tensor | None] = WeakTensorKeyDictionary() # pyright: ignore [reportAssignmentType]
66
+
67
+ tensor_packs: list[ZeroGPUTensorPack] = []
68
+
69
+ class ZeroGPUTensor(torch.Tensor):
70
+ pass
71
+
72
+ def empty_fake(tensor: torch.Tensor):
73
+ fake = torch.empty_like(tensor, requires_grad=tensor.requires_grad)
74
+ if fake.__class__ != tensor.__class__:
75
+ fake = _tensor_make_subclass(tensor.__class__, fake, require_grad=tensor.requires_grad) # pyright: ignore [reportArgumentType]
76
+ return fake
77
+
78
+ # Torch 2.5: https://github.com/pytorch/pytorch/issues/144152
79
+ def no_int_device(*args, **kwargs):
80
+ if len(args) and isinstance(index := args[0], int):
81
+ args = (f'cuda:{index}', *args[1:])
82
+ if isinstance(index := kwargs.get('device'), int):
83
+ kwargs['device'] = f'cuda:{index}'
84
+ return args, kwargs
85
+
86
+
87
+ class ZeroGPUFunctionMode(TorchFunctionMode):
88
+
89
+ def __torch_function__(self, func, types, args=(), kwargs: dict[str, Any] | None = None):
90
+
91
+ kwargs = {} if kwargs is None else kwargs
92
+
93
+ if func == torch._C._nn._parse_to:
94
+ args, kwargs = no_int_device(*args, **kwargs)
95
+ return func(*args, **kwargs)
96
+
97
+ # Redispatch: tensor.cuda() -> tensor.to(device='cuda')
98
+ if func == torch.Tensor.cuda or func == torch.Tensor.cpu:
99
+ memory_format = kwargs.get('memory_format')
100
+ return self.__torch_function__(torch.Tensor.to, types, (args[0],), {
101
+ 'device': 'cuda' if func == torch.Tensor.cuda else 'cpu',
102
+ **({'memory_format': memory_format} if memory_format is not None else {}),
103
+ })
104
+
105
+ # Redispatch: tensor.to('cuda') -> tensor.to(device='cuda')
106
+ if func == torch.Tensor.to and len(args) > 1:
107
+ parse_to_args, parse_to_kwargs = no_int_device(*args[1:], **kwargs)
108
+ device, dtype, _, memory_format = torch._C._nn._parse_to(*parse_to_args, **parse_to_kwargs) # pyright: ignore [reportCallIssue, reportArgumentType]
109
+ return self.__torch_function__(torch.Tensor.to, types, (args[0],), {
110
+ 'device': device,
111
+ 'dtype': dtype,
112
+ 'memory_format': memory_format,
113
+ })
114
+
115
+ if func == torch.Tensor.data.__set__: # pyright: ignore [reportAttributeAccessIssue]
116
+ self, target = args
117
+ if target in cuda_aliases:
118
+ if (target_original := cuda_aliases[target]) is None:
119
+ raise Exception(OFFLOADED_ERROR_MESSAGE.format(resolve_name(func), target))
120
+ original = empty_fake(self)
121
+ original.data = target_original
122
+ cuda_aliases[self] = original
123
+ elif self in cuda_aliases:
124
+ del cuda_aliases[self]
125
+ self.data = target
126
+ return
127
+
128
+ if func == torch.Tensor.device.__get__:
129
+ tensor, = args
130
+ if tensor in cuda_aliases:
131
+ return torch.device('cuda', index=0)
132
+
133
+ elif func == torch.Tensor.__repr__:
134
+ tensor, = args
135
+ if tensor in cuda_aliases:
136
+ if (original := cuda_aliases[tensor]) is None:
137
+ original = tensor.to('meta')
138
+ original_class = original.__class__
139
+ original.__class__ = ZeroGPUTensor
140
+ try:
141
+ return func(original, **kwargs)
142
+ finally:
143
+ original.__class__ = original_class
144
+
145
+ elif func == torch.Tensor.untyped_storage:
146
+ tensor, = args
147
+ if tensor in cuda_aliases:
148
+ if (original := cuda_aliases[tensor]) is None:
149
+ raise Exception(OFFLOADED_ERROR_MESSAGE.format(resolve_name(func), tensor))
150
+ res = func(original, **kwargs)
151
+ res._zerogpu = True
152
+ return res
153
+
154
+ cuda: bool | None = None
155
+
156
+ # Handle device kwarg
157
+ if (device := kwargs.get('device')) is not None:
158
+ device = torch.device(device)
159
+ if device.type == 'cuda':
160
+ kwargs['device'] = torch.device('cpu')
161
+ cuda = True
162
+ else:
163
+ cuda = False
164
+
165
+ # Swap fake inputs with original data
166
+ swapped = {}
167
+ inputs_are_cuda = set()
168
+ def swap(tensor: torch.Tensor):
169
+ nonlocal inputs_are_cuda
170
+ if tensor not in cuda_aliases:
171
+ inputs_are_cuda |= {False}
172
+ return tensor
173
+ if (original := cuda_aliases[tensor]) is None:
174
+ raise Exception(OFFLOADED_ERROR_MESSAGE.format(resolve_name(func), tensor))
175
+ swapped[original] = tensor
176
+ inputs_are_cuda |= {True}
177
+ return original
178
+ args_ = tree_map_only(torch.Tensor, swap, args)
179
+ kwargs_ = tree_map_only(torch.Tensor, swap, kwargs)
180
+ if inputs_are_cuda == {True}:
181
+ if cuda is not False:
182
+ cuda = True
183
+
184
+ res = func(*args_, **kwargs_)
185
+
186
+ # Re-generate swapped fakes in case of mutation
187
+ for original, fake in swapped.items():
188
+ fake.data = empty_fake(original)
189
+
190
+ # Special case for Tensor indexing where only 'self' matters
191
+ if func in {
192
+ torch.ops.aten.index.Tensor, # pyright: ignore [reportAttributeAccessIssue]
193
+ torch.Tensor.__getitem__, # PyTorch 2.4+
194
+ }:
195
+ self = args[0]
196
+ cuda = self in cuda_aliases
197
+ inputs_are_cuda = {cuda}
198
+
199
+ # Emulate device check
200
+ if isinstance(res, torch.Tensor) or func in OPS_INPUTS_CHECK_NO_RETURN:
201
+ self = None
202
+ if len(args_) >= 1 and isinstance(args_[0], torch.Tensor):
203
+ self = args_[0]
204
+ # Only raise if func does not return its first input (Tensor.copy_)
205
+ if res is not self or func in OPS_INPUT_CHECK_SELF_RETURN:
206
+ if inputs_are_cuda == {True, False}:
207
+ raise RuntimeError(
208
+ "Expected all tensors to be on the same device, "
209
+ "but found at least two devices, cuda:0 (ZeroGPU) and cpu!"
210
+ )
211
+
212
+ # Register output
213
+ def register(tensor: torch.Tensor):
214
+ if tensor in swapped and cuda is not False:
215
+ return swapped[tensor]
216
+ if cuda is not True:
217
+ return tensor
218
+ fake = empty_fake(tensor)
219
+ cuda_aliases[fake] = tensor
220
+ return fake
221
+
222
+ return tree_map_only(torch.Tensor, register, res)
223
+
224
+ # When enabling DispatchMode, some aten ops are dispatched to FunctionMode
225
+ # We are using it for aten.alias.default and aten.set_.source_Tensor
226
+ class DefaultDispatchMode(TorchDispatchMode):
227
+ def __torch_dispatch__(self, func, types, args=(), kwargs: dict[str, Any] | None = None):
228
+ return func(*args, **(kwargs or {}))
229
+
230
+
231
+ function_mode = ZeroGPUFunctionMode()
232
+ dispatch_mode = DefaultDispatchMode()
233
+
234
+
235
+ def _untyped_storage_new_register(*args, **kwargs):
236
+ cuda = False
237
+ if (device := kwargs.get('device')) is not None and device.type == 'cuda':
238
+ cuda = True
239
+ del kwargs['device']
240
+ storage = torch._C.StorageBase.__new__(*args, **kwargs)
241
+ if cuda:
242
+ storage._zerogpu = True
243
+ return storage
244
+
245
+ @property
246
+ def _untyped_storage_device(self):
247
+ if hasattr(self, '_zerogpu'):
248
+ return torch.device('cuda', index=0)
249
+ return torch._C.StorageBase.device.__get__(self) # pyright: ignore [reportAttributeAccessIssue]
250
+
251
+ # Force dispatch
252
+ def _tensor_make_subclass_function_mode(*args, **kwargs):
253
+ with torch._C.DisableTorchFunction():
254
+ return function_mode.__torch_function__(_tensor_make_subclass, (), args=args, kwargs=kwargs)
255
+ def _asarray_function_mode(*args, **kwargs):
256
+ with torch._C.DisableTorchFunction():
257
+ return function_mode.__torch_function__(_asarray, (), args=args, kwargs=kwargs)
258
+
259
+ class _DeviceStringOnlyMeta(type):
260
+ def __instancecheck__(cls, instance):
261
+ return isinstance(instance, _device)
262
+
263
+ class _DeviceStringOnly(metaclass=_DeviceStringOnlyMeta):
264
+ def __new__(cls, *args, **kwargs):
265
+ args, kwargs = no_int_device(*args, **kwargs)
266
+ return _device(*args, **kwargs)
267
+
268
+ def _cuda_init_raise():
269
+ raise RuntimeError(
270
+ "CUDA must not be initialized in the main process "
271
+ "on Spaces with Stateless GPU environment.\n"
272
+ "You can look at this Stacktrace to find out "
273
+ "which part of your code triggered a CUDA init"
274
+ )
275
+
276
+ def _cuda_dummy_exchange_device(device):
277
+ assert device in {-1, 0}
278
+ return device
279
+
280
+ def patch():
281
+ function_mode.__enter__()
282
+ dispatch_mode.__enter__()
283
+ # TODO: only patch the methods below on the current Thread to be consistent with TorchModes
284
+ # (or hijack threading.Thread.__init__ to force Modes on all threads)
285
+ torch.Tensor._make_subclass = _tensor_make_subclass_function_mode # pyright: ignore [reportAttributeAccessIssue]
286
+ torch.UntypedStorage.__new__ = _untyped_storage_new_register
287
+ torch.UntypedStorage.device = _untyped_storage_device # pyright: ignore [reportAttributeAccessIssue]
288
+ torch.asarray = _asarray_function_mode
289
+ torch.device = _DeviceStringOnly
290
+ torch._C._cuda_init = _cuda_init_raise
291
+ torch.cuda._exchange_device = _cuda_dummy_exchange_device
292
+ torch.cuda.is_available = lambda: True
293
+ torch.cuda.device_count = lambda: 1
294
+ torch.cuda.current_device = lambda: 0
295
+ torch.cuda.get_device_capability = lambda *args, **kwargs: CUDA_DEVICE_CAPABILITY
296
+ torch.cuda.get_device_properties = lambda *args, **kwargs: CUDA_DEVICE_PROPERTIES
297
+ torch.cuda.get_device_name = lambda *args, **kwargs: CUDA_DEVICE_NAME
298
+ torch.cuda.memory.memory_stats_as_nested_dict = lambda *args, **kwargs: CUDA_MEMORY_STATS_AS_NESTED_DICT
299
+ torch.cuda.cudart = lambda: cudart
300
+ # PyTorch 2.3
301
+ if _cuda_maybe_exchange_device is not None: # pragma: no cover
302
+ setattr(torch.cuda, '_maybe_exchange_device', _cuda_dummy_exchange_device)
303
+ bitsandbytes().patch()
304
+
305
+ def unpatch():
306
+ try:
307
+ dispatch_mode.__exit__(None, None, None)
308
+ function_mode.__exit__(None, None, None)
309
+ except RuntimeError:
310
+ pass # patch() and unpatch() called from different threads
311
+ torch.Tensor._make_subclass = _tensor_make_subclass
312
+ torch.UntypedStorage.__new__ = torch._C.StorageBase.__new__
313
+ torch.UntypedStorage.device = torch._C.StorageBase.device # pyright: ignore [reportAttributeAccessIssue]
314
+ torch.asarray = _asarray
315
+ torch.device = _device
316
+ torch._C._cuda_init = _cuda_init
317
+ torch.cuda._exchange_device = _cuda_exchange_device
318
+ torch.cuda.is_available = _cuda_available
319
+ torch.cuda.device_count = _cuda_device_count
320
+ torch.cuda.current_device = _cuda_current_device
321
+ torch.cuda.get_device_capability = _cuda_get_device_capability
322
+ torch.cuda.get_device_properties = _cuda_get_device_properties
323
+ torch.cuda.get_device_name = _cuda_get_device_name
324
+ torch.cuda.memory.memory_stats_as_nested_dict = _cuda_memory_stats_as_nested_dict
325
+ torch.cuda.cudart = _cuda_cudart
326
+ # PyTorch 2.3
327
+ if _cuda_maybe_exchange_device is not None: # pragma: no cover
328
+ setattr(torch.cuda, '_maybe_exchange_device', _cuda_exchange_device)
329
+ bitsandbytes().unpatch()
330
+
331
+
332
+ def _total_unpacked_size():
333
+ tensors = [tensor for tensor in cuda_aliases.values() if tensor is not None]
334
+ deduped = {AliasId.from_tensor(tensor): tensor for tensor in tensors}
335
+ return sum([tensor.numel() * tensor.element_size() for tensor in deduped.values()])
336
+
337
+
338
+ def _pack(offload_dir: str):
339
+ # Pack to disk
340
+ originals: set[torch.Tensor] = set()
341
+ originals_dedup: dict[AliasId, torch.Tensor] = {}
342
+ fakes: dict[torch.Tensor, list[torch.Tensor]] = defaultdict(list)
343
+ for fake, original in cuda_aliases.items():
344
+ # TODO filter-out sparse Tensors
345
+ if original is not None:
346
+ original_id = AliasId.from_tensor(original)
347
+ if original_id not in originals_dedup:
348
+ originals_dedup[original_id] = original
349
+ originals |= {original}
350
+ fakes[originals_dedup[original_id]] += [fake]
351
+ progress = tqdm(
352
+ total=_total_unpacked_size(),
353
+ unit='B',
354
+ unit_scale=True,
355
+ desc="ZeroGPU tensors packing",
356
+ ) if tqdm is not None else nullcontext()
357
+ with progress as progress:
358
+ update = progress.update if progress is not None else lambda _: None
359
+ pack = pack_tensors(originals, fakes, offload_dir, callback=update)
360
+ tensor_packs.append(pack)
361
+ # Free memory
362
+ for fake_list in fakes.values():
363
+ for fake in fake_list:
364
+ cuda_aliases[fake] = None
365
+
366
+ def pack():
367
+ _pack(Config.zerogpu_offload_dir)
368
+ gc.collect()
369
+ malloc_trim()
370
+
371
+ def init(nvidia_uuid: str):
372
+ os.environ['CUDA_VISIBLE_DEVICES'] = nvidia_uuid
373
+ torch.Tensor([0]).cuda()
374
+
375
+ def size():
376
+ return _total_unpacked_size() + sum([pack.total_size for pack in tensor_packs])
377
+
378
+ def _move(callback: Callable[[int], None] | None = None):
379
+ callback = callback if callback is not None else lambda _: None
380
+ # CPU -> CUDA
381
+ moved: dict[AliasId, torch.Tensor] = {}
382
+ for fake, original in cuda_aliases.items():
383
+ if original is not None:
384
+ original_id = AliasId.from_tensor(original)
385
+ if original_id not in moved:
386
+ moved[original_id] = original.cuda()
387
+ callback(fake.numel() * fake.element_size())
388
+ for fake, original in cuda_aliases.items():
389
+ if original is not None:
390
+ fake.data = moved[AliasId.from_tensor(original)]
391
+ # Disk -> CUDA
392
+ for tensor_pack in tensor_packs:
393
+ pack_to_cuda(tensor_pack, callback=callback)
394
+ bitsandbytes().move()
395
+
396
+ def move(callback: Callable[[int], None] | None = None):
397
+ callback = callback if callback is not None else lambda _: None
398
+ with ThreadPoolExecutor(1) as e:
399
+ e.submit(copy_context().run, _move, callback=callback).result()
400
+ torch.cuda.synchronize()
401
+
402
+ def is_in_bad_fork():
403
+ with ProcessPoolExecutor(mp_context=multiprocessing.get_context('fork')) as e:
404
+ f = e.submit(torch.cuda._is_in_bad_fork)
405
+ return f.result()
406
+
407
+ def bitsandbytes():
408
+ # Lazy import
409
+ from . import bitsandbytes
410
+ return bitsandbytes
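A minimal, self-contained sketch of the interception idea implemented above (illustrative only, not the package code): a TorchFunctionMode rewrites device='cuda' arguments so allocations silently stay on CPU, which is why no real CUDA initialization happens in the main process.

import torch
from torch.overrides import TorchFunctionMode

class FakeCudaMode(TorchFunctionMode):
    def __torch_function__(self, func, types, args=(), kwargs=None):
        kwargs = dict(kwargs or {})
        device = kwargs.get('device')
        if device is not None and torch.device(device).type == 'cuda':
            kwargs['device'] = torch.device('cpu')  # keep the allocation on CPU
        return func(*args, **kwargs)

with FakeCudaMode():
    t = torch.zeros(4, device='cuda')  # no GPU required

print(t.device)  # cpu -- the real ZeroGPUFunctionMode additionally tracks t as a fake CUDA alias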
spaces/zero/torch/patching_legacy.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ """
3
+ # pyright: reportPrivateImportUsage=false
4
+
5
+ from __future__ import annotations
6
+
7
+ import multiprocessing
8
+ import os
9
+ from concurrent.futures import ProcessPoolExecutor
10
+ from contextlib import suppress
11
+ from functools import partial
12
+ from types import SimpleNamespace
13
+ from typing import Any
14
+ from typing import Callable
15
+ from typing import Optional
16
+ from typing import Tuple
17
+
18
+ import torch
19
+ from torch.utils.weak import WeakTensorKeyDictionary
20
+
21
+ from ...config import Config
22
+ from . import bitsandbytes
23
+
24
+
25
+ # Nvidia A100.80G MIG (drivers 535) / Torch 2.2.0
26
+ CUDA_DEVICE_NAME = 'NVIDIA A100-SXM4-80GB MIG 3g.40gb'
27
+ CUDA_TOTAL_MEMORY = 42144366592
28
+ CUDA_MEM_GET_INFO = (41911451648, CUDA_TOTAL_MEMORY)
29
+ CUDA_DEVICE_CAPABILITY = (8, 0)
30
+ CUDA_DEVICE_PROPERTIES = SimpleNamespace(name=CUDA_DEVICE_NAME, major=8, minor=0, total_memory=CUDA_TOTAL_MEMORY, multi_processor_count=42)
31
+
32
+ GENERIC_METHOD_NAMES = [
33
+ 'arange',
34
+ 'as_tensor',
35
+ 'asarray',
36
+ 'bartlett_window',
37
+ 'blackman_window',
38
+ 'empty',
39
+ 'empty_like',
40
+ 'empty_strided',
41
+ 'eye',
42
+ 'full',
43
+ 'full_like',
44
+ 'hamming_window',
45
+ 'hann_window',
46
+ 'kaiser_window',
47
+ 'linspace',
48
+ 'logspace',
49
+ 'ones',
50
+ 'ones_like',
51
+ 'rand',
52
+ 'rand_like',
53
+ 'randint',
54
+ 'randint_like',
55
+ 'randn',
56
+ 'randn_like',
57
+ 'randperm',
58
+ 'range',
59
+ 'sparse_bsc_tensor',
60
+ 'sparse_bsr_tensor',
61
+ 'sparse_compressed_tensor',
62
+ 'sparse_coo_tensor',
63
+ 'sparse_csc_tensor',
64
+ 'sparse_csr_tensor',
65
+ 'tensor',
66
+ 'tril_indices',
67
+ 'triu_indices',
68
+ 'zeros',
69
+ 'zeros_like',
70
+ ]
71
+
72
+
73
+ TO_CUDA = (torch.device('cuda'), None, False, None)
74
+
75
+ _tensor__deepcopy__ = torch.Tensor.__deepcopy__
76
+ _tensor_to = torch.Tensor.to
77
+ _tensor_cuda = torch.Tensor.cuda
78
+ _tensor_cpu = torch.Tensor.cpu
79
+ _torch_generics = {name: getattr(torch, name) for name in GENERIC_METHOD_NAMES}
80
+ _cuda_init = torch._C._cuda_init
81
+ _cuda_available = torch.cuda.is_available
82
+ _cuda_device_count = torch.cuda.device_count
83
+ _cuda_current_device = torch.cuda.current_device
84
+ _cuda_mem_get_info = torch.cuda.mem_get_info
85
+ _cuda_get_device_capability = torch.cuda.get_device_capability
86
+ _cuda_get_device_properties = torch.cuda.get_device_properties
87
+ _cuda_get_device_name = torch.cuda.get_device_name
88
+
89
+ TensorToArgs = Tuple[Optional[torch.device], Optional[torch.dtype], bool, Optional[torch.memory_format]]
90
+
91
+ to_ops: dict[torch.Tensor, TensorToArgs] = WeakTensorKeyDictionary() # type: ignore
92
+
93
+ def _tensor_new_register(*args, **kwargs):
94
+ new_tensor: torch.Tensor = torch._C._TensorBase.__new__(*args, **kwargs)
95
+ if (base_tensor := new_tensor._base) is not None:
96
+ if base_tensor in to_ops:
97
+ to_ops[new_tensor] = to_ops[base_tensor]
98
+ return new_tensor
99
+
100
+ def _tensor_deepcopy_register(self: torch.Tensor, memo):
101
+ new_tensor = _tensor__deepcopy__(self, memo)
102
+ if isinstance(new_tensor, torch.Tensor):
103
+ if self in to_ops:
104
+ to_ops[new_tensor] = to_ops[self]
105
+ return new_tensor
106
+
107
+ @property
108
+ def _tensor_device_property(self: torch.Tensor):
109
+ if self in to_ops:
110
+ return torch.device(type='cuda', index=0)
111
+ del torch.Tensor.device
112
+ try:
113
+ return self.device
114
+ finally:
115
+ torch.Tensor.device = _tensor_device_property # type: ignore
116
+
117
+ @property
118
+ def _tensor_dtype_property(self: torch.Tensor):
119
+ if self in to_ops:
120
+ if (to_dtype := to_ops[self][1]) is not None:
121
+ return to_dtype
122
+ del torch.Tensor.dtype
123
+ try:
124
+ return self.dtype
125
+ finally:
126
+ torch.Tensor.dtype = _tensor_dtype_property # type: ignore
127
+
128
+ def _to_op_register(self: torch.Tensor, *args, **kwargs):
129
+ parsed = torch._C._nn._parse_to(*args, **kwargs)
130
+ device, dtype, *_ = parsed
131
+ try:
132
+ to_args = to_ops.pop(self)
133
+ except KeyError:
134
+ to_args = None
135
+ if device is None: # pyright: ignore [reportUnnecessaryComparison]
136
+ if to_args is not None:
137
+ to_ops[self] = (to_args[0], dtype, *to_args[2:])
138
+ return self
139
+ return _tensor_to(self, *args, **kwargs)
140
+ if device.type != 'cuda':
141
+ if to_args is not None:
142
+ if (to_dtype := to_args[1]) is not None:
143
+ kwargs = {'dtype': to_dtype, **kwargs}
144
+ return _tensor_to(self, *args, **kwargs)
145
+ to_ops[self] = parsed
146
+ return self
147
+
148
+ def _cuda_op_arg_check(device: torch.device | int | str | None) -> bool:
149
+ if device is None:
150
+ return True
151
+ if isinstance(device, int):
152
+ return True
153
+ if isinstance(device, str):
154
+ device = torch.device(device)
155
+ return device.type == 'cuda'
156
+
157
+ def _cuda_op_register(self: torch.Tensor, device: torch.device | int | str | None = None, **kwargs):
158
+ if not _cuda_op_arg_check(device):
159
+ # Let PyTorch handle the fail
160
+ return _tensor_cuda(self, device, **kwargs)
161
+ to_ops[self] = TO_CUDA
162
+ return self
163
+
164
+ def _cpu_op_remove(self: torch.Tensor, **kwargs):
165
+ try:
166
+ to_args = to_ops.pop(self)
167
+ except KeyError:
168
+ to_args = None
169
+ if to_args is not None:
170
+ if (to_dtype := to_args[1]) is not None:
171
+ return _tensor_to(self, 'cpu', **{'dtype': to_dtype, **kwargs})
172
+ return _tensor_cpu(self, **kwargs)
173
+
174
+ def _cuda_init_raise():
175
+ raise RuntimeError(
176
+ "CUDA must not be initialized in the main process "
177
+ "on Spaces with Stateless GPU environment.\n"
178
+ "You can look at this Stacktrace to find out "
179
+ "which part of your code triggered a CUDA init"
180
+ )
181
+
182
+ def _generic_method_register(name: str, *args: Any, **kwargs: Any):
183
+ try:
184
+ device = torch.device(kwargs.get('device', "cpu"))
185
+ except Exception:
186
+ return _torch_generics[name](*args, **kwargs)
187
+ if device.type != 'cuda':
188
+ return _torch_generics[name](*args, **kwargs)
189
+ tensor = _torch_generics[name](*args, **{**kwargs, 'device': "cpu"})
190
+ to_ops[tensor] = TO_CUDA
191
+ return tensor
192
+
193
+ def patch():
194
+ torch.Tensor.__deepcopy__ = _tensor_deepcopy_register
195
+ torch.Tensor.__new__ = _tensor_new_register # pyright: ignore [reportAttributeAccessIssue]
196
+ torch.Tensor.to = _to_op_register # type: ignore
197
+ torch.Tensor.cuda = _cuda_op_register # type: ignore
198
+ torch.Tensor.cpu = _cpu_op_remove # type: ignore
199
+ if Config.zero_patch_torch_device:
200
+ torch.Tensor.device = _tensor_device_property # type: ignore
201
+ torch.Tensor.dtype = _tensor_dtype_property # pyright: ignore [reportAttributeAccessIssue]
202
+ for name in GENERIC_METHOD_NAMES:
203
+ setattr(torch, name, partial(_generic_method_register, name))
204
+ torch._C._cuda_init = _cuda_init_raise
205
+ torch.cuda.is_available = lambda: True
206
+ torch.cuda.device_count = lambda: 1
207
+ torch.cuda.current_device = lambda: 0
208
+ torch.cuda.mem_get_info = lambda *args, **kwargs: CUDA_MEM_GET_INFO
209
+ torch.cuda.get_device_capability = lambda *args, **kwargs: CUDA_DEVICE_CAPABILITY
210
+ torch.cuda.get_device_properties = lambda *args, **kwargs: CUDA_DEVICE_PROPERTIES
211
+ torch.cuda.get_device_name = lambda *args, **kwargs: CUDA_DEVICE_NAME
212
+ bitsandbytes.patch()
213
+
214
+ def unpatch():
215
+ torch.Tensor.__deepcopy__ = _tensor__deepcopy__
216
+ with suppress(AttributeError):
217
+ del torch.Tensor.__new__
218
+ torch.Tensor.to = _tensor_to
219
+ torch.Tensor.cuda = _tensor_cuda
220
+ torch.Tensor.cpu = _tensor_cpu
221
+ with suppress(AttributeError):
222
+ del torch.Tensor.device
223
+ with suppress(AttributeError):
224
+ del torch.Tensor.dtype
225
+ for name in GENERIC_METHOD_NAMES:
226
+ setattr(torch, name, _torch_generics[name])
227
+ torch._C._cuda_init = _cuda_init
228
+ torch.cuda.is_available = _cuda_available
229
+ torch.cuda.device_count = _cuda_device_count
230
+ torch.cuda.current_device = _cuda_current_device
231
+ torch.cuda.mem_get_info = _cuda_mem_get_info
232
+ torch.cuda.get_device_capability = _cuda_get_device_capability
233
+ torch.cuda.get_device_properties = _cuda_get_device_properties
234
+ torch.cuda.get_device_name = _cuda_get_device_name
235
+ bitsandbytes.unpatch()
236
+
237
+ def pack():
238
+ pass
239
+
240
+ def init(nvidia_uuid: str):
241
+ os.environ['CUDA_VISIBLE_DEVICES'] = nvidia_uuid
242
+ torch.Tensor([0]).cuda() # CUDA init
243
+
244
+ def size():
245
+ return 0
246
+
247
+ def move(callback: Callable[[int], None] | None = None):
248
+ for op in to_ops.items():
249
+ tensor, parsed_args = op
250
+ _, dtype, _, memory_format = parsed_args
251
+ tensor.data = _tensor_to(tensor,
252
+ device='cuda',
253
+ dtype=dtype,
254
+ memory_format=memory_format,
255
+ ) # type: ignore
256
+ bitsandbytes.move()
257
+ torch.cuda.synchronize()
258
+
259
+ def is_in_bad_fork():
260
+ with ProcessPoolExecutor(mp_context=multiprocessing.get_context('fork')) as e:
261
+ f = e.submit(torch.cuda._is_in_bad_fork)
262
+ return f.result()
263
+
264
+ def disable_cuda_intercept():
265
+ torch.Tensor.to = _tensor_to
266
+ torch.Tensor.cuda = _tensor_cuda
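Self-contained sketch of the legacy strategy above (illustrative only): .cuda() and .to('cuda') merely record the requested move in a weak per-tensor table, and the real transfer is replayed later by move() once a GPU has been attributed.

import torch
from torch.utils.weak import WeakTensorKeyDictionary

pending = WeakTensorKeyDictionary()  # tensor -> target device, mirroring to_ops

def lazy_cuda(tensor: torch.Tensor) -> torch.Tensor:
    pending[tensor] = torch.device('cuda')  # remember the intent, keep data on CPU
    return tensor

weights = lazy_cuda(torch.ones(2, 2))
print(weights.device)  # still cpu

def replay_moves():  # roughly what move() does once CUDA is really available
    for tensor, device in pending.items():
        tensor.data = tensor.to(device)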
spaces/zero/torch/static.py ADDED
@@ -0,0 +1,136 @@
1
+ """
2
+ """
3
+
4
+ from types import SimpleNamespace as _SimpleNamespace
5
+
6
+ import torch as _torch
7
+
8
+ from ...config import Config
9
+
10
+
11
+ def compute_base_free_memory(total_memory: int):
12
+ pytorch_base_memory = 309002240 # TODO: fine-grain per: torch-version x GPU(-MIG) model
13
+ return total_memory - pytorch_base_memory - Config.zerogpu_cuda_reserved_memory
14
+
15
+ CUDA_DEVICE_NAME = Config.zerogpu_cuda_device_name
16
+ CUDA_TOTAL_MEMORY = Config.zerogpu_cuda_total_memory
17
+ CUDA_MEM_GET_INFO = (compute_base_free_memory(CUDA_TOTAL_MEMORY), CUDA_TOTAL_MEMORY)
18
+ CUDA_DEVICE_CAPABILITY = (Config.zerogpu_cuda_capability_major, Config.zerogpu_cuda_capability_minor)
19
+ CUDA_DEVICE_PROPERTIES = _SimpleNamespace(
20
+ name=CUDA_DEVICE_NAME,
21
+ major=CUDA_DEVICE_CAPABILITY[0],
22
+ minor=CUDA_DEVICE_CAPABILITY[1],
23
+ total_memory=CUDA_TOTAL_MEMORY,
24
+ multi_processor_count=Config.zerogpu_cuda_multi_processor_count,
25
+ # TODO: L2_cache_size
26
+ )
27
+
28
+ if _torch.version.cuda.startswith("12."): # pyright: ignore [reportAttributeAccessIssue]
29
+ CUDA_MEMORY_STATS_AS_NESTED_DICT = {
30
+ "num_alloc_retries": 0,
31
+ "num_ooms": 0,
32
+ "max_split_size": -1,
33
+ "num_sync_all_streams": 0,
34
+ "num_device_alloc": 0,
35
+ "num_device_free": 0,
36
+ "allocation": {
37
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
38
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
39
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
40
+ },
41
+ "segment": {
42
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
43
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
44
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
45
+ },
46
+ "active": {
47
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
48
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
49
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
50
+ },
51
+ "inactive_split": {
52
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
53
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
54
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
55
+ },
56
+ "allocated_bytes": {
57
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
58
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
59
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
60
+ },
61
+ "reserved_bytes": {
62
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
63
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
64
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
65
+ },
66
+ "active_bytes": {
67
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
68
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
69
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
70
+ },
71
+ "inactive_split_bytes": {
72
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
73
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
74
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
75
+ },
76
+ "requested_bytes": {
77
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
78
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
79
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
80
+ },
81
+ "oversize_allocations": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
82
+ "oversize_segments": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
83
+ }
84
+ else: # pragma: no cover (CUDA 11)
85
+ CUDA_MEMORY_STATS_AS_NESTED_DICT = {
86
+ "num_alloc_retries": 0,
87
+ "num_ooms": 0,
88
+ "max_split_size": -1,
89
+ "allocation": {
90
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
91
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
92
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
93
+ },
94
+ "segment": {
95
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
96
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
97
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
98
+ },
99
+ "active": {
100
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
101
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
102
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
103
+ },
104
+ "inactive_split": {
105
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
106
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
107
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
108
+ },
109
+ "allocated_bytes": {
110
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
111
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
112
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
113
+ },
114
+ "reserved_bytes": {
115
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
116
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
117
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
118
+ },
119
+ "active_bytes": {
120
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
121
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
122
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
123
+ },
124
+ "inactive_split_bytes": {
125
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
126
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
127
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
128
+ },
129
+ "requested_bytes": {
130
+ "all": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
131
+ "small_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
132
+ "large_pool": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
133
+ },
134
+ "oversize_allocations": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
135
+ "oversize_segments": {"current": 0, "peak": 0, "allocated": 0, "freed": 0},
136
+ }
spaces/zero/torch/types.py ADDED
@@ -0,0 +1,23 @@
1
+ """
2
+ """
3
+ from __future__ import annotations
4
+
5
+ from typing import NamedTuple
6
+
7
+ import torch
8
+
9
+
10
+ class AliasId(NamedTuple):
11
+ data_ptr: int
12
+ dtype: torch.dtype
13
+ shape: tuple[int, ...]
14
+ stride: tuple[int, ...]
15
+
16
+ @classmethod
17
+ def from_tensor(cls, tensor: torch.Tensor):
18
+ return cls(
19
+ tensor.data_ptr(),
20
+ tensor.dtype,
21
+ tensor.shape,
22
+ tensor.stride(),
23
+ )
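Illustrative sketch of how AliasId-style keys deduplicate tensors that share the same underlying data, which is what _pack() and _move() in patching.py rely on:

import torch

def alias_key(t: torch.Tensor):
    # same fields as AliasId.from_tensor
    return (t.data_ptr(), t.dtype, tuple(t.shape), tuple(t.stride()))

a = torch.zeros(8)
b = a.view(8)   # same storage, same layout
c = a[:4]       # same storage, different shape

print(alias_key(a) == alias_key(b))  # True: packed and moved only once
print(alias_key(a) == alias_key(c))  # False: treated as a distinct alias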
spaces/zero/tqdm.py ADDED
@@ -0,0 +1,24 @@
1
+ """
2
+ """
3
+
4
+ from multiprocessing.synchronize import RLock as MultiprocessingRLock
5
+
6
+
7
+ try:
8
+ from tqdm import tqdm as _tqdm
9
+ except ImportError: # pragma: no cover
10
+ _tqdm = None
11
+
12
+
13
+ def remove_tqdm_multiprocessing_lock():
14
+ if _tqdm is None: # pragma: no cover
15
+ return
16
+ tqdm_lock = _tqdm.get_lock()
17
+ assert tqdm_lock.__class__.__name__ == 'TqdmDefaultWriteLock'
18
+ tqdm_lock.locks = [
19
+ lock for lock in tqdm_lock.locks
20
+ if not isinstance(lock, MultiprocessingRLock)
21
+ ]
22
+
23
+
24
+ tqdm = _tqdm
spaces/zero/types.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ """
3
+ from __future__ import annotations
4
+
5
+
6
+ from dataclasses import dataclass
7
+ from datetime import timedelta
8
+ from typing import Any
9
+ from typing import Dict
10
+ from typing import Tuple
11
+ from typing import TypedDict
12
+ from typing_extensions import Callable
13
+ from typing_extensions import Generic
14
+ from typing_extensions import ParamSpec
15
+ from typing_extensions import TypeAlias
16
+ from typing_extensions import TypeVar
17
+
18
+
19
+ Params = Tuple[Tuple[object, ...], Dict[str, Any]]
20
+ Res = TypeVar('Res')
21
+ Param = ParamSpec('Param')
22
+
23
+ class EmptyKwargs(TypedDict):
24
+ pass
25
+
26
+ @dataclass
27
+ class OkResult(Generic[Res]):
28
+ value: Res
29
+ @dataclass
30
+ class ExceptionResult:
31
+ traceback: str
32
+ error_cls: str
33
+ @dataclass
34
+ class AbortedResult:
35
+ pass
36
+ @dataclass
37
+ class EndResult:
38
+ pass
39
+ @dataclass
40
+ class GradioQueueEvent:
41
+ method_name: str
42
+ args: tuple[Any, ...]
43
+ kwargs: dict[str, Any]
44
+
45
+ RegularResQueueResult: TypeAlias = "OkResult[Res] | ExceptionResult | GradioQueueEvent"
46
+ GeneratorResQueueResult: TypeAlias = "OkResult[Res] | ExceptionResult | EndResult | GradioQueueEvent"
47
+ YieldQueueResult: TypeAlias = "OkResult[Res] | ExceptionResult | EndResult | AbortedResult"
48
+
49
+ Duration: TypeAlias = "int | timedelta"
50
+ DynamicDuration: TypeAlias = "Duration | Callable[Param, Duration] | None"
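Hedged usage sketch: DynamicDuration lets the scheduled GPU time be a fixed number of seconds, a timedelta, or a callable computed from the task's own arguments (resolved by static_duration() in wrappers.py below). The @spaces.GPU(duration=...) form is assumed here for illustration.

from datetime import timedelta
import spaces

def needed_duration(prompt: str, steps: int = 20) -> timedelta:
    return timedelta(seconds=10 + steps)  # scale the GPU slot with the workload

@spaces.GPU(duration=needed_duration)
def generate(prompt: str, steps: int = 20):
    ...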
spaces/zero/wrappers.py ADDED
@@ -0,0 +1,423 @@
1
+ """
2
+ """
3
+ from __future__ import annotations
4
+
5
+ import multiprocessing
6
+ import os
7
+ import signal
8
+ import traceback
9
+ import warnings
10
+ from concurrent.futures import ThreadPoolExecutor
11
+ from contextlib import nullcontext
12
+ from contextvars import copy_context
13
+ from datetime import timedelta
14
+ from functools import partial
15
+ from functools import wraps
16
+ from multiprocessing.context import ForkProcess
17
+ from pickle import PicklingError
18
+ from queue import Empty
19
+ from queue import Queue as ThreadQueue
20
+ from threading import Thread
21
+ from typing import TYPE_CHECKING
22
+ from typing import Callable
23
+ from typing import Generator
24
+ from typing import Generic
25
+ from typing_extensions import assert_never
26
+
27
+ import psutil
28
+
29
+ from ..config import Config
30
+ from ..utils import debug
31
+ from ..utils import drop_params
32
+ from ..utils import gradio_request_var
33
+ from ..utils import SimpleQueue as Queue
34
+ from . import client
35
+ from . import torch
36
+ from .api import AllowToken
37
+ from .api import NvidiaIndex
38
+ from .api import NvidiaUUID
39
+ from .gradio import GradioPartialContext
40
+ from .gradio import error
41
+ from .gradio import get_server_port
42
+ from .gradio import patch_gradio_queue
43
+ from .gradio import try_process_queue_event
44
+ from .tqdm import remove_tqdm_multiprocessing_lock
45
+ from .tqdm import tqdm
46
+ from .types import * # TODO: Please don't do that
47
+
48
+
49
+ GENERATOR_GLOBAL_TIMEOUT = 20 * 60
50
+
51
+ SPAWN_PROGRESS_CLEANUP = 0.1
52
+ SPAWN_PROGRESS_INIT = 0.1
53
+
54
+
55
+ Process = multiprocessing.get_context('fork').Process
56
+ forked = False
57
+
58
+
59
+ class Worker(Generic[Res]):
60
+ process: ForkProcess
61
+ arg_queue: Queue[tuple[Params, GradioPartialContext]]
62
+ res_queue: Queue[Res | None]
63
+ _sentinel: Thread
64
+
65
+ def __init__(
66
+ self,
67
+ target: Callable[[
68
+ Queue[tuple[Params, GradioPartialContext]],
69
+ Queue[Res | None],
70
+ AllowToken,
71
+ NvidiaUUID,
72
+ list[int],
73
+ ], None],
74
+ allow_token: str,
75
+ nvidia_uuid: str,
76
+ ):
77
+ self._sentinel = Thread(target=self._close_on_exit, daemon=True)
78
+ self.arg_queue = Queue()
79
+ self.res_queue = Queue()
80
+ debug(f"{self.arg_queue._writer.fileno()=}") # pyright: ignore [reportAttributeAccessIssue]
81
+ debug(f"{self.res_queue._writer.fileno()=}") # pyright: ignore [reportAttributeAccessIssue]
82
+ if (server_port := get_server_port()) is not None:
83
+ fds = [c.fd for c in psutil.Process().connections() if c.laddr.port == server_port]
84
+ debug(f"{fds=}")
85
+ else:
86
+ warnings.warn("Using a ZeroGPU function outside of Gradio caching or request might block the app")
87
+ fds = []
88
+ args = self.arg_queue, self.res_queue, allow_token, nvidia_uuid, fds
89
+ if TYPE_CHECKING:
90
+ target(*args)
91
+ self.process = Process(
92
+ target=target,
93
+ args=args,
94
+ daemon=True,
95
+ )
96
+ self.process.start()
97
+ self._sentinel.start()
98
+
99
+ def _close_on_exit(self):
100
+ self.process.join()
101
+ self.arg_queue.close()
102
+ self.res_queue.wlock_release()
103
+ self.res_queue.put(None)
104
+
105
+
106
+ def worker_init(
107
+ res_queue: Queue[RegularResQueueResult | None] | Queue[GeneratorResQueueResult | None],
108
+ allow_token: str,
109
+ nvidia_uuid: str,
110
+ fds: list[int],
111
+ ) -> None | ExceptionResult:
112
+ # Immediately close file descriptors
113
+ for fd in fds:
114
+ try:
115
+ os.close(fd)
116
+ except Exception as e: # pragma: no cover
117
+ if isinstance(e, OSError) and e.errno == 9:
118
+ continue
119
+ return exception_result(e)
120
+ try:
121
+ remove_tqdm_multiprocessing_lock()
122
+ except Exception: # pragma: no cover
123
+ print("Error while trying to remove tqdm mp_lock:")
124
+ traceback.print_exc()
125
+ progress = nullcontext()
126
+ if tqdm is not None and Config.zero_gpu_v2:
127
+ progress = tqdm(total=100, desc="ZeroGPU init", file=open(os.devnull, 'w'))
128
+ try: # Unrecoverable init part
129
+ patch_gradio_queue(res_queue)
130
+ with progress as progress:
131
+ current_progress = 0 # Gradio does not support float progress updates
132
+ def update(n: float):
133
+ nonlocal current_progress
134
+ current_progress += n
135
+ if progress is not None:
136
+ progress.update(round(current_progress * 100) - progress.n)
137
+ client.allow(allow_token)
138
+ update(SPAWN_PROGRESS_CLEANUP)
139
+ torch.unpatch()
140
+ torch.init(nvidia_uuid)
141
+ update(SPAWN_PROGRESS_INIT)
142
+ callback = None
143
+ if (transfer_size := torch.size()) > 0:
144
+ remaining = 1 - (SPAWN_PROGRESS_CLEANUP + SPAWN_PROGRESS_INIT)
145
+ callback = lambda n: update(n * remaining / transfer_size)
146
+ torch.move(callback=callback)
147
+ except Exception as e: # pragma: no cover
148
+ return exception_result(e)
149
+
150
+
151
+ def process_duration(duration: Duration | None):
152
+ if duration is None or isinstance(duration, timedelta):
153
+ return duration
154
+ return timedelta(seconds=duration)
155
+
156
+
157
+ def static_duration(duration: DynamicDuration[Param], *args: Param.args, **kwargs: Param.kwargs):
158
+ if not callable(duration):
159
+ return duration
160
+ return duration(*args, **kwargs)
161
+
162
+
163
+ def regular_function_wrapper(
164
+ task: Callable[Param, Res],
165
+ duration: DynamicDuration[Param],
166
+ ) -> Callable[Param, Res]:
167
+
168
+ import gradio as gr
169
+
170
+ request_var = gradio_request_var()
171
+ workers: dict[NvidiaIndex, Worker[RegularResQueueResult[Res] | None]] = {}
172
+ task_id = id(task)
173
+
174
+ @wraps(task)
175
+ def gradio_handler(*args: Param.args, **kwargs: Param.kwargs) -> Res:
176
+
177
+ if forked:
178
+ return task(*args, **kwargs)
179
+
180
+ request = request_var.get()
181
+ duration_ = static_duration(duration, *args, **kwargs)
182
+ duration_ = process_duration(duration_)
183
+ schedule_response = client.schedule(task_id=task_id, request=request, duration=duration_)
184
+ allow_token = schedule_response.allowToken
185
+ nvidia_index = schedule_response.nvidiaIndex
186
+ nvidia_uuid = schedule_response.nvidiaUUID
187
+ release = partial(client.release, allow_token)
188
+
189
+ try:
190
+ worker = workers.pop(nvidia_index)
191
+ except KeyError:
192
+ worker = None
193
+
194
+ if worker is not None and worker.process.is_alive() and schedule_response.idle:
195
+ assert worker.arg_queue.empty()
196
+ assert worker.res_queue.empty()
197
+ else:
198
+ worker = Worker(thread_wrapper, allow_token, nvidia_uuid)
199
+
200
+ try:
201
+ worker.arg_queue.put(((args, kwargs), GradioPartialContext.get()))
202
+ except PicklingError: # TODO: detailed serialization diagnostic
203
+ release(fail=True)
204
+ raise
205
+
206
+ while True:
207
+ res = worker.res_queue.get()
208
+ if res is None:
209
+ release(fail=True, allow_404=True)
210
+ raise error("ZeroGPU worker error", "GPU task aborted")
211
+ if isinstance(res, ExceptionResult):
212
+ release(fail=True)
213
+ print(res.traceback)
214
+ raise error("ZeroGPU worker error", res.error_cls)
215
+ if isinstance(res, OkResult):
216
+ release()
217
+ workers[nvidia_index] = worker
218
+ return res.value
219
+ if isinstance(res, GradioQueueEvent):
220
+ try_process_queue_event(res.method_name, *res.args, **res.kwargs)
221
+ continue
222
+ assert_never(res)
223
+
224
+
225
+ def thread_wrapper(
226
+ arg_queue: Queue[tuple[Params, GradioPartialContext]],
227
+ res_queue: Queue[RegularResQueueResult[Res] | None],
228
+ allow_token: str,
229
+ nvidia_uuid: str,
230
+ fds: list[int],
231
+ ):
232
+ global forked
233
+ forked = True
234
+ signal.signal(signal.SIGTERM, drop_params(arg_queue.close))
235
+ initialized = False
236
+ while True:
237
+ try:
238
+ (args, kwargs), gradio_context = arg_queue.get()
239
+ except OSError:
240
+ break
241
+ if not initialized:
242
+ if (res := worker_init(
243
+ res_queue=res_queue,
244
+ allow_token=allow_token,
245
+ nvidia_uuid=nvidia_uuid,
246
+ fds=fds,
247
+ )) is not None:
248
+ res_queue.put(res)
249
+ return
250
+ initialized = True
251
+ GradioPartialContext.apply(gradio_context)
252
+ context = copy_context()
253
+ with ThreadPoolExecutor() as executor:
254
+ future = executor.submit(context.run, task, *args, **kwargs) # type: ignore
255
+ try:
256
+ res = future.result()
257
+ except Exception as e:
258
+ res = exception_result(e)
259
+ else:
260
+ res = OkResult(res)
261
+ try:
262
+ res_queue.put(res)
263
+ except PicklingError as e:
264
+ res_queue.put(exception_result(e))
265
+
266
+ # https://github.com/python/cpython/issues/91002
267
+ if not hasattr(task, '__annotations__'):
268
+ gradio_handler.__annotations__ = {}
269
+
270
+ return gradio_handler
271
+
272
+
273
+ def generator_function_wrapper(
274
+ task: Callable[Param, Generator[Res, None, None]],
275
+ duration: DynamicDuration[Param],
276
+ ) -> Callable[Param, Generator[Res, None, None]]:
277
+
278
+ import gradio as gr
279
+
280
+ request_var = gradio_request_var()
281
+ workers: dict[NvidiaIndex, Worker[GeneratorResQueueResult[Res] | None]] = {}
282
+ task_id = id(task)
283
+
284
+ @wraps(task)
285
+ def gradio_handler(*args: Param.args, **kwargs: Param.kwargs) -> Generator[Res, None, None]:
286
+
287
+ if forked:
288
+ yield from task(*args, **kwargs)
289
+ return
290
+
291
+ request = request_var.get()
292
+ duration_ = static_duration(duration, *args, **kwargs)
293
+ duration_ = process_duration(duration_)
294
+ schedule_response = client.schedule(task_id=task_id, request=request, duration=duration_)
295
+ allow_token = schedule_response.allowToken
296
+ nvidia_index = schedule_response.nvidiaIndex
297
+ nvidia_uuid = schedule_response.nvidiaUUID
298
+ release = partial(client.release, allow_token)
299
+
300
+ try:
301
+ worker = workers.pop(nvidia_index)
302
+ except KeyError:
303
+ worker = None
304
+
305
+ if worker is not None and worker.process.is_alive() and schedule_response.idle:
306
+ assert worker.arg_queue.empty()
307
+ assert worker.res_queue.empty()
308
+ else:
309
+ worker = Worker(thread_wrapper, allow_token, nvidia_uuid)
310
+
311
+ try:
312
+ worker.arg_queue.put(((args, kwargs), GradioPartialContext.get()))
313
+ except PicklingError: # TODO: detailed serialization diagnostic
314
+ release(fail=True)
315
+ raise
316
+
317
+ yield_queue: ThreadQueue[YieldQueueResult[Res]] = ThreadQueue()
318
+ def fill_yield_queue(worker: Worker[GeneratorResQueueResult[Res] | None]):
319
+ while True:
320
+ res = worker.res_queue.get()
321
+ if res is None:
322
+ release(fail=True, allow_404=True)
323
+ yield_queue.put(AbortedResult())
324
+ return
325
+ if isinstance(res, ExceptionResult):
326
+ release(fail=True)
327
+ yield_queue.put(res)
328
+ return
329
+ if isinstance(res, EndResult):
330
+ release()
331
+ workers[nvidia_index] = worker
332
+ yield_queue.put(EndResult())
333
+ return
334
+ if isinstance(res, OkResult):
335
+ yield_queue.put(OkResult(res.value))
336
+ continue
337
+ if isinstance(res, GradioQueueEvent): # pragma: no cover (not working properly on Gradio side)
338
+ try_process_queue_event(res.method_name, *res.args, **res.kwargs)
339
+ continue
340
+ debug(f"fill_yield_queue: assert_never({res=})")
341
+ assert_never(res)
342
+ from typing_extensions import assert_never
343
+ with ThreadPoolExecutor() as e:
344
+ f = e.submit(copy_context().run, fill_yield_queue, worker)
345
+ f.add_done_callback(lambda _: debug("fill_yield_queue DONE"))
346
+ while True:
347
+ try:
348
+ res = yield_queue.get(timeout=GENERATOR_GLOBAL_TIMEOUT)
349
+ except Empty: # pragma: no cover
350
+ debug(f"yield_queue TIMEOUT ({GENERATOR_GLOBAL_TIMEOUT=})")
351
+ raise
352
+ if isinstance(res, AbortedResult):
353
+ raise error("ZeroGPU worker error", "GPU task aborted")
354
+ if isinstance(res, ExceptionResult):
355
+ print(res.traceback)
356
+ raise error("ZeroGPU worker error", res.error_cls)
357
+ if isinstance(res, EndResult):
358
+ break
359
+ if isinstance(res, OkResult):
360
+ yield res.value
361
+ continue
362
+ debug(f"gradio_handler: assert_never({res=})")
363
+ assert_never(res)
364
+
365
+
366
+ def thread_wrapper(
367
+ arg_queue: Queue[tuple[Params, GradioPartialContext]],
368
+ res_queue: Queue[GeneratorResQueueResult[Res] | None],
369
+ allow_token: str,
370
+ nvidia_uuid: str,
371
+ fds: list[int],
372
+ ):
373
+ global forked
374
+ forked = True
375
+ signal.signal(signal.SIGTERM, drop_params(arg_queue.close))
376
+ initialized = False
377
+ while True:
378
+ try:
379
+ (args, kwargs), gradio_context = arg_queue.get()
380
+ except OSError:
381
+ break
382
+ if not initialized:
383
+ if (res := worker_init(
384
+ res_queue=res_queue,
385
+ allow_token=allow_token,
386
+ nvidia_uuid=nvidia_uuid,
387
+ fds=fds,
388
+ )) is not None:
389
+ res_queue.put(res)
390
+ return
391
+ initialized = True
392
+ def iterate():
393
+ gen = task(*args, **kwargs) # type: ignore
394
+ while True:
395
+ try:
396
+ res = next(gen)
397
+ except StopIteration:
398
+ break
399
+ except Exception as e:
400
+ res_queue.put(exception_result(e))
401
+ break
402
+ try:
403
+ res_queue.put(OkResult(res))
404
+ except PicklingError as e:
405
+ res_queue.put(exception_result(e))
406
+ break
407
+ else:
408
+ continue
409
+ GradioPartialContext.apply(gradio_context)
410
+ with ThreadPoolExecutor() as executor:
411
+ executor.submit(copy_context().run, iterate)
412
+ res_queue.put(EndResult())
413
+
414
+ # https://github.com/python/cpython/issues/91002
415
+ if not hasattr(task, '__annotations__'):
416
+ gradio_handler.__annotations__ = {}
417
+
418
+ return gradio_handler
419
+
420
+
421
+ def exception_result(exc: Exception) -> ExceptionResult:
422
+ formatted = traceback.format_exception(type(exc), exc, exc.__traceback__)
423
+ return ExceptionResult(traceback=''.join(formatted), error_cls=exc.__class__.__name__)
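Hedged usage sketch to tie the wrappers together: a generator task is assumed to be routed to generator_function_wrapper by the public GPU decorator; each yielded value crosses the worker's res_queue as an OkResult and is re-yielded in the Gradio process, and EndResult releases the GPU and recycles the worker.

import spaces

@spaces.GPU(duration=60)
def stream_tokens(prompt: str):
    # runs in the forked GPU worker process
    for token in ("Zero", "GPU"):
        yield token  # forwarded through the yield queue as OkResult values

# Iterating stream_tokens(...) in the main process yields values as they arrive.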