safikhan committed on
Commit
e174cbc
·
verified ·
1 Parent(s): 614b49e

Upload README.md with huggingface_hub

Browse files
Files changed (1) hide show
  1. README.md +3 -67
README.md CHANGED
@@ -1,67 +1,3 @@
1
- ---
2
- license: cc-by-4.0
3
- ---
4
-
5
- Sample Code
6
- ```
7
- import requests
8
- import torch
9
- import os
10
- from tqdm import tqdm
11
- from transformers import AutoProcessor, Gemma3ForConditionalGeneration, AutoTokenizer
12
-
13
-
14
- model_id = "ai4bharat/IndicTrans3-gemma-beta"
15
- language = "Hindi"
16
- # permitted languages = Assamese, Bengali, English, Gujarati, Hindi, Kannada, Malayalam, Marathi, Nepali, Odia, Punjabi, Sanskrit, Tamil, Telugu, Urdu
17
-
18
- model = Gemma3ForConditionalGeneration.from_pretrained(
19
- model_id, device_map="auto", attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16
20
- ).eval()
21
-
22
- tokenizer = AutoTokenizer.from_pretrained(model_id)
23
-
24
- src = [
25
- "When I was young, I used to go to the park every day.",
26
- "We watched a new movie last week, which was very inspiring.",
27
- "If you had met me at that time, we would have gone out to eat.",
28
- "My friend has invited me to his birthday party, and I will give him a gift."
29
- ]
30
-
31
- _PROMPT = (
32
- "<bos><start_of_turn>user\n"
33
- "Translate the following text to {tgt_lang}: {source_text}:"
34
- "<end_of_turn>\n<start_of_turn>model\n"
35
- )
36
-
37
- batch_size = 100 # Adjust based on memory constraints
38
- outputs = []
39
-
40
- for i in tqdm(range(0, len(src), batch_size)):
41
- batch = src[i:i + batch_size]
42
-
43
- batch = [
44
- _PROMPT.format(
45
- tgt_lang=language,
46
- source_text=s
47
- )
48
- for s in batch
49
- ]
50
- tokinp = tokenizer(batch, return_tensors='pt', padding="longest")
51
- for k in tokinp:
52
- tokinp[k] = tokinp[k].to("cuda")
53
-
54
- out = model.generate(
55
- **tokinp,
56
- max_new_tokens=8192,
57
- num_beams=1,
58
- do_sample=False
59
- )
60
-
61
- for b, o in zip(batch, out):
62
- input_length = len(tokenizer(b)['input_ids'])
63
- finout = tokenizer.decode(o, skip_special_tokens=True)
64
- outputs.append(finout.split('model')[-1].strip())
65
-
66
- print(outputs)
67
- ```
 
1
+ ---
2
+ license: cc-by-4.0
3
+ ---