VietCat committed
Commit b54a7c5 · 1 Parent(s): c4fe2d6

split text into chunks to fit the token length of 256

Files changed (1): app.py (+43, -11)
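
The new splitting logic slides a MAX_LEN = 256 token window over the encoded text with a STRIDE = 128 step, so consecutive chunks overlap by half a window and the last chunk is right-padded. A minimal standalone sketch of that windowing (using a dummy token list and a placeholder pad id instead of PhoBERT's real tokenizer output) shows what a 600-token input produces:

# Standalone sketch of the sliding-window chunking added in this commit.
# `tokens` and PAD_ID are stand-ins; app.py works on real PhoBERT token ids
# and pads with tokenizer.pad_token_id.
MAX_LEN = 256              # window size, matching the 256-token budget
STRIDE = 128               # window step; adjacent windows overlap by 128 tokens
PAD_ID = 0                 # placeholder pad id

tokens = list(range(600))  # pretend the text encoded to 600 token ids
chunks = []
for i in range(0, len(tokens), STRIDE):
    chunk = tokens[i:i + MAX_LEN]
    if len(chunk) < MAX_LEN:
        chunk += [PAD_ID] * (MAX_LEN - len(chunk))  # right-pad the final window
    chunks.append(chunk)
    if i + MAX_LEN >= len(tokens):  # the window has reached the end of the text
        break

print(len(chunks))                             # 4 windows, starting at 0, 128, 256, 384
print(all(len(c) == MAX_LEN for c in chunks))  # True: every chunk is exactly 256 ids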
app.py CHANGED
@@ -1,11 +1,48 @@
 from flask import Flask, request, jsonify
-from transformers import TFAutoModel, AutoTokenizer
+from transformers import AutoTokenizer, TFAutoModel
+import tensorflow as tf
+import numpy as np
 
 app = Flask(__name__)
 
 # Load PhoBERT (TensorFlow version)
-tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
-model = TFAutoModel.from_pretrained("vinai/phobert-base")
+MODEL_NAME = "vinai/phobert-base"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = TFAutoModel.from_pretrained(MODEL_NAME)
+
+MAX_LEN = 256
+STRIDE = 128
+
+def split_text_into_chunks(text):
+    tokens = tokenizer.encode(text, add_special_tokens=True)
+    chunks = []
+    for i in range(0, len(tokens), STRIDE):
+        chunk = tokens[i:i + MAX_LEN]
+        if len(chunk) < MAX_LEN:
+            chunk += [tokenizer.pad_token_id] * (MAX_LEN - len(chunk))
+        chunks.append(chunk)
+        if i + MAX_LEN >= len(tokens):
+            break
+    return chunks
+
+def embed_text(text):
+    chunks = split_text_into_chunks(text)
+    embeddings = []
+
+    for chunk in chunks:
+        input_ids = tf.constant([chunk])
+        attention_mask = tf.cast(input_ids != tokenizer.pad_token_id, tf.int32)
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+
+        hidden_states = outputs.last_hidden_state
+        mask = tf.cast(tf.expand_dims(attention_mask, -1), tf.float32)
+        summed = tf.reduce_sum(hidden_states * mask, axis=1)
+        count = tf.reduce_sum(mask, axis=1)
+        mean_pooled = summed / count
+        embeddings.append(mean_pooled.numpy()[0])
+
+    final_embedding = np.mean(embeddings, axis=0)
+    return final_embedding.tolist()
 
 @app.route('/embed', methods=['POST'])
 def embed():
@@ -14,17 +51,12 @@ def embed():
     if not text:
         return jsonify({"error": "No text provided"}), 400
 
-    inputs = tokenizer(text, return_tensors="tf")  # Convert to TensorFlow tensors
-    outputs = model(**inputs)
-
-    # Take the embedding from the first hidden state
-    embedding = outputs.last_hidden_state[:, 0, :].numpy().tolist()  # Use .numpy() to convert the TensorFlow tensor to a list
-
-    return jsonify({"embeddings": embedding})
+    embedding = embed_text(text)
+    return jsonify({"embedding": embedding})
 
 @app.route('/', methods=['GET'])
 def index():
-    return "PhoBERT Space is running!"
+    return "PhoBERT vector API is running!"
 
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860)
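
For reference, a minimal client-side sketch of calling the updated endpoint, assuming the app is reachable locally on the port set in app.py (7860) and that the requests library is installed:

# Hypothetical client for the /embed endpoint defined in app.py.
import requests

resp = requests.post(
    "http://localhost:7860/embed",
    # Vietnamese sample input (PhoBERT is a Vietnamese model)
    json={"text": "Xin chào, đây là một đoạn văn bản tiếng Việt."},
)
resp.raise_for_status()
vector = resp.json()["embedding"]
print(len(vector))  # hidden size of the model: 768 for phobert-base

Because each chunk is mean-pooled over its unpadded tokens and the chunk vectors are then averaged, the endpoint returns a single fixed-size vector regardless of how long the input text is.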