rwkv-v5-1b5-cpu

Sleeping

BlinkDL committed on Mar 3, 2023

Commit

61b9ff7

1 Parent(s): f0b8656

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,8 +2,11 @@ import gradio as gr
 import os
 from datetime import datetime
 from huggingface_hub import hf_hub_download
-title = "RWKV-4 14B fp16 ctx4096"
 desc = '''Links:
 <a href='https://github.com/BlinkDL/ChatRWKV' target="_blank" style="margin:0 1em">ChatRWKV</a>
 <a href='https://github.com/BlinkDL/RWKV-LM' target="_blank" style="margin:0 1em">RWKV-LM</a>
@@ -14,8 +17,10 @@ os.environ["RWKV_JIT_ON"] = '1'
 os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
 from rwkv.model import RWKV
-model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-14b", filename="RWKV-4-Pile-14B-20230227-ctx4096-test503.pth")
-model = RWKV(model=model_path, strategy='cuda fp16 *32 -> cpu fp32')
 from rwkv.utils import PIPELINE, PIPELINE_ARGS
 pipeline = PIPELINE(model, "20B_tokenizer.json")
@@ -39,13 +44,16 @@ def infer(
     else:
         ctx = f'\n{ctx.strip()}'
     all_tokens = []
     out_last = 0
     out_str = ''
     occurrence = {}
     state = None
     for i in range(int(token_count)):
-        out, state = model.forward(pipeline.encode(ctx)[:4096] if i == 0 else [token], state)
         for n in args.token_ban:
             out[n] = -float('inf')
         for n in occurrence:

 import os
 from datetime import datetime
 from huggingface_hub import hf_hub_download
+from pynvml import *
+nvmlInit()
+gpu_h = nvmlDeviceGetHandleByIndex(0)
+title = "RWKV-4 14B fp16 ctx1024"
 desc = '''Links:
 <a href='https://github.com/BlinkDL/ChatRWKV' target="_blank" style="margin:0 1em">ChatRWKV</a>
 <a href='https://github.com/BlinkDL/RWKV-LM' target="_blank" style="margin:0 1em">RWKV-LM</a>
 os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
 from rwkv.model import RWKV
+# model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-14b", filename="RWKV-4-Pile-14B-20230213-8019.pth")
+# model = RWKV(model=model_path, strategy='cuda fp16 *34 -> cpu fp32')
+model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-pile-169m", filename="RWKV-4-Pile-169M-20220807-8023.pth")
+model = RWKV(model=model_path, strategy='cuda fp16')
 from rwkv.utils import PIPELINE, PIPELINE_ARGS
 pipeline = PIPELINE(model, "20B_tokenizer.json")
     else:
         ctx = f'\n{ctx.strip()}'
+    gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
+    print(f'vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
     all_tokens = []
     out_last = 0
     out_str = ''
     occurrence = {}
     state = None
     for i in range(int(token_count)):
+        out, state = model.forward(pipeline.encode(ctx)[:1024] if i == 0 else [token], state)
         for n in args.token_ban:
             out[n] = -float('inf')
         for n in occurrence: