ERROR text_generation_launcher: Error when initializing model
#1 opened by coltonbehannon
Hello,

When I run text-generation-inference with:

text-generation-launcher --model-id TheBloke/neural-chat-7B-v3-2-GPTQ --revision main --quantize gptq

everything works and the model runs fine. However, when I run:

text-generation-launcher --model-id TheBloke/neural-chat-7B-v3-2-GPTQ --revision gptq-4bit-32g-actorder_True --quantize gptq

I get the following error:
ERROR text_generation_launcher: Error when initializing model
Traceback (most recent call last):
File "/opt/conda/bin/text-generation-server", line 8, in <module>
sys.exit(app())
File "/opt/conda/lib/python3.9/site-packages/typer/main.py", line 311, in __call__
return get_command(self)(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/typer/core.py", line 778, in main
return _main(
File "/opt/conda/lib/python3.9/site-packages/typer/core.py", line 216, in _main
rv = self.invoke(ctx)
File "/opt/conda/lib/python3.9/site-packages/click/core.py", line 1688, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/opt/conda/lib/python3.9/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/opt/conda/lib/python3.9/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/opt/conda/lib/python3.9/site-packages/typer/main.py", line 683, in wrapper
return callback(**use_params) # type: ignore
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/cli.py", line 83, in serve
server.serve(
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/server.py", line 207, in serve
asyncio.run(
File "/opt/conda/lib/python3.9/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/opt/conda/lib/python3.9/asyncio/base_events.py", line 634, in run_until_complete
self.run_forever()
File "/opt/conda/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
self._run_once()
File "/opt/conda/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once
handle._run()
File "/opt/conda/lib/python3.9/asyncio/events.py", line 80, in _run
self._context.run(self._callback, *self._args)
> File "/opt/conda/lib/python3.9/site-packages/text_generation_server/server.py", line 159, in serve_inner
model = get_model(
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/__init__.py", line 252, in get_model
return FlashMistral(
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/flash_mistral.py", line 321, in __init__
model = FlashMistralForCausalLM(config, weights)
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_mistral_modeling.py", line 480, in __init__
self.model = MistralModel(config, weights)
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_mistral_modeling.py", line 416, in __init__
[
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_mistral_modeling.py", line 417, in <listcomp>
MistralLayer(
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_mistral_modeling.py", line 351, in __init__
self.self_attn = MistralAttention(
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_mistral_modeling.py", line 227, in __init__
self.query_key_value = load_attention(config, prefix, weights)
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_mistral_modeling.py", line 156, in load_attention
return _load_gqa(config, prefix, weights)
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/models/custom_modeling/flash_mistral_modeling.py", line 189, in _load_gqa
get_linear(weight, bias=None, quantize=config.quantize)
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/utils/layers.py", line 311, in get_linear
linear = Ex4bitLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
File "/opt/conda/lib/python3.9/site-packages/text_generation_server/utils/gptq/exllama.py", line 115, in __init__
assert groupsize == self.groupsize
AssertionError
2023-12-06T22:44:50.981264Z ERROR shard-manager: text_generation_launcher: Shard complete standard error output:
[same traceback as above, ending in the same AssertionError at text_generation_server/utils/gptq/exllama.py, line 115]
rank=0
2023-12-06T22:44:51.079122Z ERROR text_generation_launcher: Shard 0 failed to start
2023-12-06T22:44:51.079162Z INFO text_generation_launcher: Shutting down shards
Error: ShardCannotStart
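
The failing check is the last line of the traceback, assert groupsize == self.groupsize, in text_generation_server/utils/gptq/exllama.py (line 115). If I'm reading it right, Ex4bitLinear compares the group size handed down from the model config against one it infers from the packed tensor shapes. A minimal sketch of that inference as I understand it (my reconstruction from the traceback, not the verbatim TGI source):

# Reconstruction of the check that trips in exllama.py (not verbatim TGI
# source). For 4-bit GPTQ, qweight packs 8 weights per int32 along dim 0,
# and qzeros has one zero-point row per quantization group, so the group
# size can be recovered from the shapes alone.
def inferred_groupsize(qweight, qzeros):
    in_features = qweight.shape[0] * 8  # 8 x 4-bit values per int32
    num_groups = qzeros.shape[0]        # one zero-point row per group
    return in_features // num_groups

# The assertion fails when this inferred value disagrees with the groupsize
# taken from the config (32 for the gptq-4bit-32g-actorder_True revision).

So it looks like the group size this revision declares and the one implied by the fused attention tensors disagree at load time, though I don't know why the main revision is unaffected.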
This is with text-generation-inference 1.0. Any suggestions?
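
For reference, the quantization parameters each revision declares can be pulled from its quantize_config.json (a sketch using huggingface_hub; swap the revision string to compare main against gptq-4bit-32g-actorder_True):

# Sketch: download and print the quantize_config.json that a given revision
# declares, to compare bits / group_size / desc_act across revisions.
import json

from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="TheBloke/neural-chat-7B-v3-2-GPTQ",
    filename="quantize_config.json",
    revision="gptq-4bit-32g-actorder_True",  # or "main" for the working branch
)
with open(path) as f:
    print(json.dumps(json.load(f), indent=2))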