"Attention Mask Not Set"

#3
by trbielec - opened

I tried running this on an A10G and wasn't able to generate any tokens. Logs below:

Running on local URL: http://0.0.0.0:7860

To create a public link, set share=True in launch().
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Exception in thread Thread-9 (generate):
Traceback (most recent call last):
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/transformers/generation/utils.py", line 2024, in generate
result = self._sample(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/transformers/generation/utils.py", line 2982, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/.cache/huggingface/modules/transformers_modules/THUDM/longwriter-glm4-9b/81b025e373d163efd7908a787b3fb907424c6184/modeling_chatglm.py", line 801, in forward
transformer_outputs = self.transformer(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/.cache/huggingface/modules/transformers_modules/THUDM/longwriter-glm4-9b/81b025e373d163efd7908a787b3fb907424c6184/modeling_chatglm.py", line 707, in forward
hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/.cache/huggingface/modules/transformers_modules/THUDM/longwriter-glm4-9b/81b025e373d163efd7908a787b3fb907424c6184/modeling_chatglm.py", line 551, in forward
layer_ret = layer(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/.cache/huggingface/modules/transformers_modules/THUDM/longwriter-glm4-9b/81b025e373d163efd7908a787b3fb907424c6184/modeling_chatglm.py", line 454, in forward
attention_output, kv_cache = self.self_attention(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/.cache/huggingface/modules/transformers_modules/THUDM/longwriter-glm4-9b/81b025e373d163efd7908a787b3fb907424c6184/modeling_chatglm.py", line 351, in forward
context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/.cache/huggingface/modules/transformers_modules/THUDM/longwriter-glm4-9b/81b025e373d163efd7908a787b3fb907424c6184/modeling_chatglm.py", line 211, in forward
context_layer = flash_attn_unpadded_func(
TypeError: 'NoneType' object is not callable
Traceback (most recent call last):
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/gradio/queueing.py", line 536, in process_events
response = await route_utils.call_process_api(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/gradio/route_utils.py", line 288, in call_process_api
output = await app.get_blocks().process_api(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/gradio/blocks.py", line 1931, in process_api
result = await self.call_function(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/gradio/blocks.py", line 1528, in call_function
prediction = await utils.async_iteration(iterator)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/gradio/utils.py", line 671, in async_iteration
return await iterator.__anext__()
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/gradio/utils.py", line 664, in __anext__
return await anyio.to_thread.run_sync(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
return await get_async_backend().run_sync_in_worker_thread(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2177, in run_sync_in_worker_thread
return await future
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 859, in run
result = context.run(func, *args)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/gradio/utils.py", line 647, in run_sync_iterator_async
return next(iterator)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/gradio/utils.py", line 809, in gen_wrapper
response = next(iterator)
File "/home/user/app/app.py", line 67, in predict
for new_token in streamer:
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/transformers/generation/streamers.py", line 223, in __next__
value = self.text_queue.get(timeout=self.timeout)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/queue.py", line 179, in get
raise Empty
_queue.Empty
Exception in thread Thread-10 (generate):
Traceback (most recent call last):
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/transformers/generation/utils.py", line 2024, in generate
result = self._sample(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/transformers/generation/utils.py", line 2982, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/.cache/huggingface/modules/transformers_modules/THUDM/longwriter-glm4-9b/81b025e373d163efd7908a787b3fb907424c6184/modeling_chatglm.py", line 801, in forward
transformer_outputs = self.transformer(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/.cache/huggingface/modules/transformers_modules/THUDM/longwriter-glm4-9b/81b025e373d163efd7908a787b3fb907424c6184/modeling_chatglm.py", line 707, in forward
hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/.cache/huggingface/modules/transformers_modules/THUDM/longwriter-glm4-9b/81b025e373d163efd7908a787b3fb907424c6184/modeling_chatglm.py", line 551, in forward
layer_ret = layer(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/.cache/huggingface/modules/transformers_modules/THUDM/longwriter-glm4-9b/81b025e373d163efd7908a787b3fb907424c6184/modeling_chatglm.py", line 454, in forward
attention_output, kv_cache = self.self_attention(
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/.cache/huggingface/modules/transformers_modules/THUDM/longwriter-glm4-9b/81b025e373d163efd7908a787b3fb907424c6184/modeling_chatglm.py", line 351, in forward
context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/user/.pyenv/versions/3.10.14/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/home/user/.cache/huggingface/modules/transformers_modules/THUDM/longwriter-glm4-9b/81b025e373d163efd7908a787b3fb907424c6184/modeling_chatglm.py", line 211, in forward
context_layer = flash_attn_unpadded_func(
TypeError: 'NoneType' object is not callable

Knowledge Engineering Group (KEG) & Data Mining at Tsinghua University org

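Hi, you need to install FlashAttention2 in your environment. The `TypeError: 'NoneType' object is not callable` raised at the `flash_attn_unpadded_func(` call suggests that the flash-attn package could not be imported, so that function was never defined.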

All I did was duplicate the Space on HF (Hugging Face). Do the requirements/imports need to be updated?

Knowledge Engineering Group (KEG) & Data Mining at Tsinghua University org
edited Aug 17

Good news! We've updated modeling_chatglm.py to remove the dependency on FlashAttention2. You can download the latest code and deploy the model directly.

Sign up or log in to comment