Gary committed · commit 27e1332 · 1 parent: d43e767

Reduce document size

Files changed:
- app.py: +8 -3
- indexer.py: +7 -5
app.py
CHANGED
@@ -32,10 +32,15 @@ class CustomRAG:
 
 def answer_question(query):
     docs = load_raw_dataset()
+    llm = get_llm("google/flan-t5-base")
+    vector_database = create_vector_database(
+        docs, "sentence-transformers/all-MiniLM-L6-v2"
+    )
+    prompt_template = get_prompt_template()
     rag = CustomRAG(
-
-
-
+        vector_database,
+        llm,
+        prompt_template,
     )
     response, _ = rag.run(query)
 
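Review note: get_prompt_template() is called here but its definition is outside this diff. For orientation only, a minimal sketch of one plausible shape for it, built on the PromptTemplate import that indexer.py already carries; the variable names and template wording are assumptions, not code from this Space:

# Hypothetical sketch -- the real get_prompt_template() is not shown in
# this commit. It uses only the PromptTemplate import already in indexer.py.
from langchain.prompts import PromptTemplate

def get_prompt_template():
    # Put the retrieved context ahead of the user's question, the usual
    # "stuff" layout for a small seq2seq model like flan-t5-base.
    return PromptTemplate(
        input_variables=["context", "question"],
        template=(
            "Answer the question using only the context below.\n\n"
            "Context:\n{context}\n\n"
            "Question: {question}\n"
            "Answer:"
        ),
    )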
indexer.py
CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
 from langchain.schema import Document
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
-from transformers import AutoTokenizer,
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain.llms import HuggingFacePipeline
 from langchain.prompts import PromptTemplate
 
@@ -15,6 +15,8 @@ def load_raw_dataset():
 
     df["combined"] = df["input"] + " " + df["output"]
 
+    df = df.sample(n=min(5000, len(df)), random_state=42).reset_index(drop=True)
+
     docs = [
         Document(
             page_content=row["combined"],
@@ -34,15 +36,15 @@ def create_vector_database(docs, model_name):
 
 def get_llm(model_name):
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model =
-
+    model = AutoModelForSeq2SeqLM.from_pretrained(
+        "google/flan-t5-base", torch_dtype="auto", device_map="auto"
     )
 
     pipe = pipeline(
-        "
+        "text2text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_new_tokens=
+        max_new_tokens=1024,
         temperature=0.7,
         do_sample=True,
     )
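Review note: the added df.sample(n=min(5000, len(df)), random_state=42) line is what the commit title means by reducing document size: it caps the indexed corpus at 5,000 rows (leaving smaller datasets intact) and fixes the seed, so the FAISS index shrinks and rebuilds reproducibly. A standalone sketch of the same pattern, with illustrative data:

import pandas as pd

# Illustrative frame; the real one comes from load_raw_dataset().
df = pd.DataFrame({"input": ["q"] * 12000, "output": ["a"] * 12000})
df["combined"] = df["input"] + " " + df["output"]

# Cap at 5,000 rows; min() keeps smaller datasets whole, and the fixed
# seed yields the same subsample on every index rebuild.
df = df.sample(n=min(5000, len(df)), random_state=42).reset_index(drop=True)
assert len(df) == 5000

Two smaller observations on get_llm: the new code loads "google/flan-t5-base" directly even though the tokenizer still uses the model_name argument, so the parameter is now only half-honored; and the function's return falls outside the hunk, but given the HuggingFacePipeline import the conventional last step is to wrap the pipeline (an assumption, not visible in this diff):

# Assumed closing line of get_llm(); not shown in this commit.
return HuggingFacePipeline(pipeline=pipe)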